From 4c466b3af02063c96675b2fa3fe045b9030d8152 Mon Sep 17 00:00:00 2001 From: Cédric Bonhomme Date: Sun, 10 Nov 2013 21:49:23 +0100 Subject: Whoosh indexing. --- pyaggr3g470r/feedgetter.py | 22 ++++++++++------------ pyaggr3g470r/search.py | 35 ++++++++++++++++++----------------- 2 files changed, 28 insertions(+), 29 deletions(-) diff --git a/pyaggr3g470r/feedgetter.py b/pyaggr3g470r/feedgetter.py index 850f4449..21cc841d 100644 --- a/pyaggr3g470r/feedgetter.py +++ b/pyaggr3g470r/feedgetter.py @@ -20,19 +20,17 @@ # along with this program. If not, see __author__ = "Cedric Bonhomme" -__version__ = "$Revision: 1.9 $" +__version__ = "$Revision: 2.0 $" __date__ = "$Date: 2010/09/02 $" -__revision__ = "$Date: 2013/11/05 $" +__revision__ = "$Date: 2013/11/10 $" __copyright__ = "Copyright (c) Cedric Bonhomme" __license__ = "GPLv3" -import hashlib import threading import feedparser from BeautifulSoup import BeautifulSoup from datetime import datetime -from contextlib import contextmanager import models import conf @@ -81,7 +79,7 @@ class FeedGetter(object): def process(self, feed): """ - Comment + Retrieves articles form the feed and add them to the database. """ #a_feed = feedparser.parse(feed_link, handlers = [self.proxy]) a_feed = feedparser.parse(feed.link) @@ -118,21 +116,21 @@ class FeedGetter(object): except: post_date = datetime(*article.updated_parsed[:6]) + # save the article article = models.Article(post_date, article.link, article_title, description, False, False) article.save() articles.append(article) - """ # add the article to the Whoosh index try: search.add_to_index([article], feed) - except: - print("Whoosh error.") + except Exception as e: + print("Whoosh error: " + str(e)) #pyaggr3g470r_log.error("Whoosh error.") - continue""" + continue + # email notification if conf.MAIL_ENABLED and feed.email_notification: - # if subscribed to the feed with app.app_context(): msg = Message('[pyAggr3g470r] ' + feed.title + ' : ' + article.title, \ sender = conf.MAIL_FROM, recipients = [conf.MAIL_TO]) @@ -140,9 +138,9 @@ class FeedGetter(object): msg.html = description mail.send(msg) + # add the articles to the list of articles for the current feed feed.articles.extend(articles) feed.articles = sorted(feed.articles, key=lambda t: t.date, reverse=True) - #feed.save() self.user.save() @@ -150,4 +148,4 @@ if __name__ == "__main__": # Point of entry in execution mode feed_getter = FeedGetter() # Retrieve all feeds - feed_getter.retrieve_feed() + feed_getter.retrieve_feed() \ No newline at end of file diff --git a/pyaggr3g470r/search.py b/pyaggr3g470r/search.py index 0b4d33b6..afb1b6ab 100644 --- a/pyaggr3g470r/search.py +++ b/pyaggr3g470r/search.py @@ -20,9 +20,9 @@ # along with this program. If not, see __author__ = "Cedric Bonhomme" -__version__ = "$Revision: 0.2 $" +__version__ = "$Revision: 0.3 $" __date__ = "$Date: 2013/06/24 $" -__revision__ = "$Date: 2013/06/25 $" +__revision__ = "$Date: 2013/11/10 $" __copyright__ = "Copyright (c) Cedric Bonhomme" __license__ = "GPLv3" @@ -37,8 +37,9 @@ from whoosh.writing import AsyncWriter import conf import utils +import models -indexdir = "./var/indexdir" +indexdir = "./pyaggr3g470r/var/indexdir" schema = Schema(title=TEXT(stored=True), \ content=TEXT, \ @@ -49,19 +50,17 @@ def create_index(): """ Creates the index. """ - mongo = mongodb.Articles(conf.MONGODB_ADDRESS, conf.MONGODB_PORT, \ - conf.MONGODB_DBNAME, conf.MONGODB_USER, conf.MONGODB_PASSWORD) - feeds = mongo.get_all_feeds() + feeds = models.Feed.objects() if not os.path.exists(indexdir): os.mkdir(indexdir) ix = create_in(indexdir, schema) writer = ix.writer() for feed in feeds: - for article in mongo.get_articles(feed["feed_id"]): - writer.add_document(title=article["article_title"], \ - content=utils.clear_string(article["article_content"]), \ - article_id=article["article_id"] , \ - feed_id=feed["feed_id"]) + for article in feed.articles: + writer.add_document(title=article.title, \ + content=utils.clear_string(article.content), \ + article_id=str(article.id).decode(), \ + feed_id=str(feed.oid).decode()) writer.commit() def add_to_index(articles, feed): @@ -73,13 +72,15 @@ def add_to_index(articles, feed): try: ix = open_dir(indexdir) except (EmptyIndexError, OSError) as e: - raise EmptyIndexError + if not os.path.exists(indexdir): + os.mkdir(indexdir) + ix = create_in(indexdir, schema) writer = AsyncWriter(ix) for article in articles: - writer.add_document(title=article["article_title"], \ - content=utils.clear_string(article["article_content"]), \ - article_id=article["article_id"] , \ - feed_id=feed["feed_id"]) + writer.add_document(title=article.title, \ + content=utils.clear_string(article.content), \ + article_id=str(article.id).decode(), \ + feed_id=str(feed.oid).decode()) writer.commit() def delete_article(feed_id, article_id): @@ -125,4 +126,4 @@ if __name__ == "__main__": print(nb_documents()) results = search("Nothomb") for article in results: - print(article) + print(article) \ No newline at end of file -- cgit