From abec11e7ca0ce49081343bb2b2eb8520058d67a8 Mon Sep 17 00:00:00 2001 From: Cédric Bonhomme Date: Sun, 13 Oct 2013 10:07:51 +0200 Subject: Added new files. First prototype with the Flask micro-framework. --- pyaggr3g470r/feedgetter.py | 159 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 159 insertions(+) create mode 100644 pyaggr3g470r/feedgetter.py (limited to 'pyaggr3g470r/feedgetter.py') diff --git a/pyaggr3g470r/feedgetter.py b/pyaggr3g470r/feedgetter.py new file mode 100644 index 00000000..487fa3b0 --- /dev/null +++ b/pyaggr3g470r/feedgetter.py @@ -0,0 +1,159 @@ +#! /usr/bin/env python +#-*- coding: utf-8 -*- + +# pyAggr3g470r - A Web based news aggregator. +# Copyright (C) 2010-2013 Cédric Bonhomme - http://cedricbonhomme.org/ +# +# For more information : http://bitbucket.org/cedricbonhomme/pyaggr3g470r/ +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. 
# If not, see <http://www.gnu.org/licenses/>.

__author__ = "Cedric Bonhomme"
__version__ = "$Revision: 1.8 $"
__date__ = "$Date: 2010/09/02 $"
__revision__ = "$Date: 2013/08/15 $"
__copyright__ = "Copyright (c) Cedric Bonhomme"
__license__ = "GPLv3"

import hashlib
import threading

import feedparser
from BeautifulSoup import BeautifulSoup
from datetime import datetime
from contextlib import contextmanager

import models
import conf
import search
import utils

import log
pyaggr3g470r_log = log.Log()

# Threads started by FeedGetter.retrieve_feed() are appended to this list.
list_of_threads = []


@contextmanager
def opened_w_error(filename, mode="r"):
    """
    Context manager opening 'filename' and yielding a (file, error) pair.

    When the file cannot be opened the pair is (None, <IOError>); otherwise
    it is (<file object>, None) and the file is closed when the 'with'
    block is left, even if the block raised.
    """
    try:
        f = open(filename, mode)
    except IOError as err:
        yield None, err
    else:
        try:
            yield f, None
        finally:
            f.close()


class FeedGetter(object):
    """
    This class is in charge of retrieving the feeds returned by
    models.Feed.objects() and of saving their new entries as articles.
    This class uses the feedparser module from Mark Pilgrim.
    For each feed a new thread is launched.
    """
    def __init__(self):
        """
        Sets the user agent used by feedparser.
        """
        # TODO: take the value from the configuration (conf.USER_AGENT).
        feedparser.USER_AGENT = "pyAggr3g470r"

    def retrieve_feed(self):
        """
        Launch a thread for each feed and wait until all of them are done.

        A feed whose thread can not be started is logged and skipped.
        """
        started = []
        for feed in models.Feed.objects():
            # launch a new thread for the RSS feed
            thread = threading.Thread(target=self.process, args=(feed,))
            try:
                thread.start()
            except Exception as err:
                # Best effort: keep going with the other feeds, but leave
                # a trace of the failure instead of hiding it.
                pyaggr3g470r_log.error(
                    "Unable to start a thread for the feed: " +
                    feed.link + " (" + str(err) + ")")
                continue
            started.append(thread)
            list_of_threads.append(thread)

        # wait only for the threads launched by this call; the module
        # level list may also hold threads of previous calls
        for thread in started:
            thread.join()

    def process(self, feed):
        """
        Parse 'feed.link' with feedparser, save every entry whose link is
        not already stored as an article and attach the new articles to
        'feed'.

        Returns nothing. Does nothing when the parsed feed has no entry.
        """
        #a_feed = feedparser.parse(feed_link, handlers = [self.proxy])
        a_feed = feedparser.parse(feed.link)
        if not a_feed['entries']:
            return

        articles = []
        for entry in a_feed['entries']:

            if models.Article.objects(link=entry.link).first() is not None:
                print("Already downloaded!")
                continue

            description = self._entry_description(entry)
            # The raw title is the fallback when sanitizing fails; an entry
            # without title gets an empty one instead of killing the thread.
            article_title = getattr(entry, "title", "")
            try:
                description = BeautifulSoup(description, "html.parser").decode()
                article_title = BeautifulSoup(article_title, "html.parser").decode()
            except Exception:
                pyaggr3g470r_log.error("Problem when sanitizing the content of the feed: " + feed.link)

            post_date = self._entry_date(entry)

            new_article = models.Article(post_date, entry.link, article_title, description, False, False)
            new_article.save()
            articles.append(new_article)

            # TODO: add the new article to the Whoosh index
            # (search.add_to_index) and, when conf.MAIL_ENABLED is set and
            # the feed is subscribed to by mail, send it with
            # utils.send_mail.

        feed.articles.extend(articles)
        feed.save()

    def _entry_description(self, entry):
        """
        Return the content of 'entry', else its description, else "".
        """
        try:
            # article content
            return entry.content[0].value
        except (AttributeError, IndexError):
            # no 'content' attribute, or an empty list of contents
            pass
        # article description
        return getattr(entry, "description", "")

    def _entry_date(self, entry):
        """
        Return the publication date of 'entry' as a naive datetime.

        The update date is used when the publication date is missing or
        unusable, and the current UTC time when both are.
        """
        for field in ("published_parsed", "updated_parsed"):
            try:
                return datetime(*getattr(entry, field)[:6])
            except (AttributeError, TypeError, ValueError):
                # attribute missing, set to None, or not a valid date
                continue
        # feedparser gives its parsed dates in UTC, hence utcnow().
        return datetime.utcnow()


if __name__ == "__main__":
    # Executed as a script: build a getter and retrieve all the feeds.
    FeedGetter().retrieve_feed()