1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
|
#! /usr/bin/env python
#-*- coding: utf-8 -*-
# pyAggr3g470r - A Web based news aggregator.
# Copyright (C) 2010-2013 Cédric Bonhomme - http://cedricbonhomme.org/
#
# For more information : https://bitbucket.org/cedricbonhomme/pyaggr3g470r/
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>
# Module metadata (legacy CVS-style $Revision$/$Date$ keywords).
__author__ = "Cedric Bonhomme"
__version__ = "$Revision: 0.3 $"
__date__ = "$Date: 2013/06/24 $"
__revision__ = "$Date: 2013/11/10 $"
__copyright__ = "Copyright (c) Cedric Bonhomme"
__license__ = "GPLv3"
import os
from whoosh.index import create_in, open_dir
from whoosh.index import EmptyIndexError
from whoosh.fields import *
from whoosh.query import *
from whoosh.qparser import QueryParser
from whoosh.writing import AsyncWriter
import conf
import utils
import models
# Directory where the Whoosh index is stored on disk.
indexdir = "./pyaggr3g470r/var/indexdir"

# Index schema: the title and both ids are stored so that search hits
# can be mapped back to database objects; the article content is
# indexed for full-text search but not stored.
schema = Schema(title=TEXT(stored=True),
                content=TEXT,
                article_id=TEXT(stored=True),
                feed_id=TEXT(stored=True))
def create_index():
    """
    Create (or recreate) the full-text index.

    Walks every article of every feed in the database and adds one
    document per article to a fresh Whoosh index located in `indexdir`.
    Any previous index content is replaced by `create_in`.
    """
    feeds = models.Feed.objects()
    if not os.path.exists(indexdir):
        # makedirs, not mkdir: the index path is nested
        # ("./pyaggr3g470r/var/indexdir") and mkdir would fail if an
        # intermediate directory does not exist yet.
        os.makedirs(indexdir)
    ix = create_in(indexdir, schema)
    writer = ix.writer()
    for feed in feeds:
        for article in feed.articles:
            # .decode(): the TEXT fields expect unicode; ids come back
            # as byte strings in this Python 2 era code base.
            writer.add_document(title=article.title,
                                content=utils.clear_string(article.content),
                                article_id=str(article.id).decode(),
                                feed_id=str(feed.oid).decode())
    writer.commit()
def add_to_index(articles, feed):
    """
    Add a list of articles belonging to `feed` to the index.

    An AsyncWriter is used because this function is called from
    multiple threads by the feedgetter module.
    """
    try:
        ix = open_dir(indexdir)
    except (EmptyIndexError, OSError):
        # No usable index yet: create the directory tree (makedirs,
        # since the path is nested) and a fresh empty index, then fall
        # through to the writer below.
        if not os.path.exists(indexdir):
            os.makedirs(indexdir)
        ix = create_in(indexdir, schema)
    writer = AsyncWriter(ix)
    for article in articles:
        # .decode(): the TEXT fields expect unicode (Python 2 era code).
        writer.add_document(title=article.title,
                            content=utils.clear_string(article.content),
                            article_id=str(article.id).decode(),
                            feed_id=str(feed.oid).decode())
    writer.commit()
def delete_article(feed_id, article_id):
    """
    Delete an article from the index.

    Raises EmptyIndexError if the index cannot be opened.
    """
    try:
        ix = open_dir(indexdir)
    except (EmptyIndexError, OSError) as e:
        # Normalize both failure modes to EmptyIndexError for callers,
        # but carry the original error message instead of raising the
        # bare exception class (which discards the cause).
        raise EmptyIndexError("unable to open the index: " + str(e))
    writer = ix.writer()
    document = And([Term("feed_id", feed_id), Term("article_id", article_id)])
    writer.delete_by_query(document)
    writer.commit()
def search(term):
    """
    Search for `term` in the article contents.

    Returns a list of (feed_id, article_id) tuples, one per matching
    article. Raises EmptyIndexError if the index cannot be opened.
    """
    try:
        ix = open_dir(indexdir)
    except (EmptyIndexError, OSError) as e:
        # Normalize to EmptyIndexError but keep the original cause text
        # instead of raising the bare exception class.
        raise EmptyIndexError("unable to open the index: " + str(e))
    with ix.searcher() as searcher:
        query = QueryParser("content", ix.schema).parse(term)
        results = searcher.search(query, limit=None)
        # Build the list inside the `with` block: hits are only valid
        # while the searcher is open.
        return [(article["feed_id"], article["article_id"]) for article in results]
def nb_documents():
    """
    Return the number of undeleted documents in the index.

    Raises EmptyIndexError if the index cannot be opened.
    """
    try:
        ix = open_dir(indexdir)
    except (EmptyIndexError, OSError) as e:
        # Normalize to EmptyIndexError but keep the original cause text
        # instead of raising the bare exception class.
        raise EmptyIndexError("unable to open the index: " + str(e))
    return ix.doc_count()
if __name__ == "__main__":
    # Point of entry in execution mode.
    # Uncomment to rebuild the index from scratch:
    #create_index()
    print(nb_documents())
    for article in search("Nothomb"):
        print(article)
|