aboutsummaryrefslogtreecommitdiff
path: root/pyaggr3g470r/search.py
diff options
context:
space:
mode:
Diffstat (limited to 'pyaggr3g470r/search.py')
-rw-r--r--pyaggr3g470r/search.py128
1 files changed, 128 insertions, 0 deletions
diff --git a/pyaggr3g470r/search.py b/pyaggr3g470r/search.py
new file mode 100644
index 00000000..0b4d33b6
--- /dev/null
+++ b/pyaggr3g470r/search.py
@@ -0,0 +1,128 @@
+#! /usr/bin/env python
+#-*- coding: utf-8 -*-
+
+# pyAggr3g470r - A Web based news aggregator.
+# Copyright (C) 2010-2013 Cédric Bonhomme - http://cedricbonhomme.org/
+#
+# For more information : https://bitbucket.org/cedricbonhomme/pyaggr3g470r/
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>
+
+__author__ = "Cedric Bonhomme"
+__version__ = "$Revision: 0.2 $"
+__date__ = "$Date: 2013/06/24 $"
+__revision__ = "$Date: 2013/06/25 $"
+__copyright__ = "Copyright (c) Cedric Bonhomme"
+__license__ = "GPLv3"
+
+import os
+
+from whoosh.index import create_in, open_dir
+from whoosh.index import EmptyIndexError
+from whoosh.fields import *
+from whoosh.query import *
+from whoosh.qparser import QueryParser
+from whoosh.writing import AsyncWriter
+
+import conf
+import utils
+
+indexdir = "./var/indexdir"
+
+schema = Schema(title=TEXT(stored=True), \
+ content=TEXT, \
+ article_id=TEXT(stored=True), \
+ feed_id=TEXT(stored=True))
+
+def create_index():
+ """
+ Creates the index.
+ """
+ mongo = mongodb.Articles(conf.MONGODB_ADDRESS, conf.MONGODB_PORT, \
+ conf.MONGODB_DBNAME, conf.MONGODB_USER, conf.MONGODB_PASSWORD)
+ feeds = mongo.get_all_feeds()
+ if not os.path.exists(indexdir):
+ os.mkdir(indexdir)
+ ix = create_in(indexdir, schema)
+ writer = ix.writer()
+ for feed in feeds:
+ for article in mongo.get_articles(feed["feed_id"]):
+ writer.add_document(title=article["article_title"], \
+ content=utils.clear_string(article["article_content"]), \
+ article_id=article["article_id"] , \
+ feed_id=feed["feed_id"])
+ writer.commit()
+
+def add_to_index(articles, feed):
+ """
+ Add a list of articles to the index.
+ Here an AsyncWriter is used because the function will
+ be called in multiple threads by the feedgetter module.
+ """
+ try:
+ ix = open_dir(indexdir)
+ except (EmptyIndexError, OSError) as e:
+ raise EmptyIndexError
+ writer = AsyncWriter(ix)
+ for article in articles:
+ writer.add_document(title=article["article_title"], \
+ content=utils.clear_string(article["article_content"]), \
+ article_id=article["article_id"] , \
+ feed_id=feed["feed_id"])
+ writer.commit()
+
+def delete_article(feed_id, article_id):
+ """
+ Delete an article from the index.
+ """
+ try:
+ ix = open_dir(indexdir)
+ except (EmptyIndexError, OSError) as e:
+ raise EmptyIndexError
+ writer = ix.writer()
+ document = And([Term("feed_id", feed_id), Term("article_id", article_id)])
+ writer.delete_by_query(document)
+ writer.commit()
+
+def search(term):
+ """
+ Search for `term` in the index.
+ Returns a list of articles.
+ """
+ try:
+ ix = open_dir(indexdir)
+ except (EmptyIndexError, OSError) as e:
+ raise EmptyIndexError
+ with ix.searcher() as searcher:
+ query = QueryParser("content", ix.schema).parse(term)
+ results = searcher.search(query, limit=None)
+ return [(article["feed_id"], article["article_id"]) for article in results]
+
+def nb_documents():
+ """
+ Return the number of undeleted documents.
+ """
+ try:
+ ix = open_dir(indexdir)
+ except (EmptyIndexError, OSError) as e:
+ raise EmptyIndexError
+ return ix.doc_count()
+
+if __name__ == "__main__":
+ # Point of entry in execution mode.
+ #create_index()
+ print(nb_documents())
+ results = search("Nothomb")
+ for article in results:
+ print(article)
bgstack15