diff options
author | Cédric Bonhomme <kimble.mandel@gmail.com> | 2013-11-10 21:49:23 +0100 |
---|---|---|
committer | Cédric Bonhomme <kimble.mandel@gmail.com> | 2013-11-10 21:49:23 +0100 |
commit | 4c466b3af02063c96675b2fa3fe045b9030d8152 (patch) | |
tree | f99b68decb2e6b884fb8896b64de1956d5778ece /pyaggr3g470r | |
parent | Email notification. (diff) | |
download | newspipe-4c466b3af02063c96675b2fa3fe045b9030d8152.tar.gz newspipe-4c466b3af02063c96675b2fa3fe045b9030d8152.tar.bz2 newspipe-4c466b3af02063c96675b2fa3fe045b9030d8152.zip |
Whoosh indexing.
Diffstat (limited to 'pyaggr3g470r')
-rw-r--r-- | pyaggr3g470r/feedgetter.py | 22 | ||||
-rw-r--r-- | pyaggr3g470r/search.py | 35 |
2 files changed, 28 insertions, 29 deletions
diff --git a/pyaggr3g470r/feedgetter.py b/pyaggr3g470r/feedgetter.py index 850f4449..21cc841d 100644 --- a/pyaggr3g470r/feedgetter.py +++ b/pyaggr3g470r/feedgetter.py @@ -20,19 +20,17 @@ # along with this program. If not, see <http://www.gnu.org/licenses/> __author__ = "Cedric Bonhomme" -__version__ = "$Revision: 1.9 $" +__version__ = "$Revision: 2.0 $" __date__ = "$Date: 2010/09/02 $" -__revision__ = "$Date: 2013/11/05 $" +__revision__ = "$Date: 2013/11/10 $" __copyright__ = "Copyright (c) Cedric Bonhomme" __license__ = "GPLv3" -import hashlib import threading import feedparser from BeautifulSoup import BeautifulSoup from datetime import datetime -from contextlib import contextmanager import models import conf @@ -81,7 +79,7 @@ class FeedGetter(object): def process(self, feed): """ - Comment + Retrieves articles form the feed and add them to the database. """ #a_feed = feedparser.parse(feed_link, handlers = [self.proxy]) a_feed = feedparser.parse(feed.link) @@ -118,21 +116,21 @@ class FeedGetter(object): except: post_date = datetime(*article.updated_parsed[:6]) + # save the article article = models.Article(post_date, article.link, article_title, description, False, False) article.save() articles.append(article) - """ # add the article to the Whoosh index try: search.add_to_index([article], feed) - except: - print("Whoosh error.") + except Exception as e: + print("Whoosh error: " + str(e)) #pyaggr3g470r_log.error("Whoosh error.") - continue""" + continue + # email notification if conf.MAIL_ENABLED and feed.email_notification: - # if subscribed to the feed with app.app_context(): msg = Message('[pyAggr3g470r] ' + feed.title + ' : ' + article.title, \ sender = conf.MAIL_FROM, recipients = [conf.MAIL_TO]) @@ -140,9 +138,9 @@ class FeedGetter(object): msg.html = description mail.send(msg) + # add the articles to the list of articles for the current feed feed.articles.extend(articles) feed.articles = sorted(feed.articles, key=lambda t: t.date, reverse=True) - #feed.save() self.user.save() @@ -150,4 +148,4 @@ if __name__ == "__main__": # Point of entry in execution mode feed_getter = FeedGetter() # Retrieve all feeds - feed_getter.retrieve_feed() + feed_getter.retrieve_feed()
\ No newline at end of file diff --git a/pyaggr3g470r/search.py b/pyaggr3g470r/search.py index 0b4d33b6..afb1b6ab 100644 --- a/pyaggr3g470r/search.py +++ b/pyaggr3g470r/search.py @@ -20,9 +20,9 @@ # along with this program. If not, see <http://www.gnu.org/licenses/> __author__ = "Cedric Bonhomme" -__version__ = "$Revision: 0.2 $" +__version__ = "$Revision: 0.3 $" __date__ = "$Date: 2013/06/24 $" -__revision__ = "$Date: 2013/06/25 $" +__revision__ = "$Date: 2013/11/10 $" __copyright__ = "Copyright (c) Cedric Bonhomme" __license__ = "GPLv3" @@ -37,8 +37,9 @@ from whoosh.writing import AsyncWriter import conf import utils +import models -indexdir = "./var/indexdir" +indexdir = "./pyaggr3g470r/var/indexdir" schema = Schema(title=TEXT(stored=True), \ content=TEXT, \ @@ -49,19 +50,17 @@ def create_index(): """ Creates the index. """ - mongo = mongodb.Articles(conf.MONGODB_ADDRESS, conf.MONGODB_PORT, \ - conf.MONGODB_DBNAME, conf.MONGODB_USER, conf.MONGODB_PASSWORD) - feeds = mongo.get_all_feeds() + feeds = models.Feed.objects() if not os.path.exists(indexdir): os.mkdir(indexdir) ix = create_in(indexdir, schema) writer = ix.writer() for feed in feeds: - for article in mongo.get_articles(feed["feed_id"]): - writer.add_document(title=article["article_title"], \ - content=utils.clear_string(article["article_content"]), \ - article_id=article["article_id"] , \ - feed_id=feed["feed_id"]) + for article in feed.articles: + writer.add_document(title=article.title, \ + content=utils.clear_string(article.content), \ + article_id=str(article.id).decode(), \ + feed_id=str(feed.oid).decode()) writer.commit() def add_to_index(articles, feed): @@ -73,13 +72,15 @@ def add_to_index(articles, feed): try: ix = open_dir(indexdir) except (EmptyIndexError, OSError) as e: - raise EmptyIndexError + if not os.path.exists(indexdir): + os.mkdir(indexdir) + ix = create_in(indexdir, schema) writer = AsyncWriter(ix) for article in articles: - writer.add_document(title=article["article_title"], \ - content=utils.clear_string(article["article_content"]), \ - article_id=article["article_id"] , \ - feed_id=feed["feed_id"]) + writer.add_document(title=article.title, \ + content=utils.clear_string(article.content), \ + article_id=str(article.id).decode(), \ + feed_id=str(feed.oid).decode()) writer.commit() def delete_article(feed_id, article_id): @@ -125,4 +126,4 @@ if __name__ == "__main__": print(nb_documents()) results = search("Nothomb") for article in results: - print(article) + print(article)
\ No newline at end of file |