diff options
Diffstat (limited to 'source/feedgetter.py')
-rwxr-xr-x | source/feedgetter.py | 167 |
1 file changed, 167 insertions, 0 deletions
#! /usr/bin/env python
#-*- coding: utf-8 -*-

# pyAggr3g470r - A Web based news aggregator.
# Copyright (C) 2010 Cédric Bonhomme - http://cedricbonhomme.org/
#
# For more information : http://bitbucket.org/cedricbonhomme/pyaggr3g470r/
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>

__author__ = "Cedric Bonhomme"
__version__ = "$Revision: 1.0 $"
__date__ = "$Date: 2010/09/02 $"
__copyright__ = "Copyright (c) Cedric Bonhomme"
__license__ = "GPLv3"

import os.path
import traceback
import threading
import feedparser
import hashlib
from BeautifulSoup import BeautifulSoup

from datetime import datetime

import utils
import mongodb

feeds_list = []
list_of_threads = []


class FeedGetter(object):
    """
    Retrieve the feeds listed in ./var/feed.lst and store the articles
    in the MongoDB database.

    This class uses the feedparser module from Mark Pilgrim.
    A new thread is launched for each feed.
    """
    def __init__(self):
        """
        Initializes the MongoDB connection used to store collections
        and articles.
        """
        self.articles = mongodb.Articles()

    def retrieve_feed(self):
        """
        Parse the file './var/feed.lst' and launch a thread for each
        RSS feed found in it.
        """
        with open("./var/feed.lst") as f:
            for a_feed in f:
                # test if the URL is well formed
                for url_regexp in utils.url_finders:
                    # evaluate the regexp only once per candidate URL
                    match = url_regexp.match(a_feed)
                    if match:
                        the_good_url = match.group(0).replace("\n", "")
                        try:
                            # launch a new thread for the RSS feed
                            thread = threading.Thread(None, self.process, \
                                            None, (the_good_url,))
                            thread.start()
                            list_of_threads.append(thread)
                        except Exception:
                            # a failed thread launch should not abort the
                            # whole run, but it must not be silent either
                            traceback.print_exc()
                        break

        # wait until all threads are done
        for th in list_of_threads:
            th.join()

    def process(self, the_good_url):
        """Request the URL.

        Executed in a thread. Only reachable resources are added
        to the database.
        """
        if utils.detect_url_errors([the_good_url]) == []:
            # if resource is available add the articles in the base.
            self.add_into_database(the_good_url)

    def add_into_database(self, feed_link):
        """
        Add the articles of the feed 'feed_link' in the MongoDB database.
        """
        a_feed = feedparser.parse(feed_link)
        if a_feed['entries'] == []:
            return
        try:
            feed_image = a_feed.feed.image.href
        except AttributeError:
            # feed provides no image: fall back to the default icon
            feed_image = "/img/feed-icon-28x28.png"

        # the feed identifier is the SHA1 hash of its URL
        sha1_hash = hashlib.sha1()
        sha1_hash.update(feed_link.encode('utf-8'))
        feed_id = sha1_hash.hexdigest()

        collection_dic = {"feed_id": feed_id, \
                            "type": 0, \
                            "feed_image": feed_image, \
                            "feed_title": utils.clear_string(a_feed.feed.title.encode('utf-8')), \
                            "feed_link": feed_link, \
                            "site_link": a_feed.feed.link.encode('utf-8'), \
                            "mail": False \
                            }

        self.articles.add_collection(collection_dic)

        articles = []
        for article in a_feed['entries']:
            description = ""
            try:
                # article content
                description = article.content[0].value
            except AttributeError:
                try:
                    # article description
                    description = article.description
                except Exception:
                    description = ""
            description = str(BeautifulSoup(description))
            article_title = str(BeautifulSoup(article.title))

            # prefer the update date; fall back to the publication date
            try:
                post_date = datetime(*article.updated_parsed[:6])
            except Exception:
                post_date = datetime(*article.published_parsed[:6])

            # the article identifier is the SHA1 hash of its URL
            sha1_hash = hashlib.sha1()
            sha1_hash.update(article.link.encode('utf-8'))
            article_id = sha1_hash.hexdigest()

            # NOTE: do not reuse the loop variable 'article' for the
            # dictionary -- the original code shadowed it here
            article_dic = {"article_id": article_id, \
                            "type": 1, \
                            "article_date": post_date, \
                            "article_link": article.link.encode('utf-8'), \
                            "article_title": article_title, \
                            "article_content": description, \
                            "article_readed": False, \
                            "article_like": False \
                            }

            articles.append(article_dic)

        self.articles.add_articles(articles, feed_id)

        # send new articles by e-mail if desired.
        #threading.Thread(None, utils.send_mail, None, (utils.mail_from, utils.mail_to, \
                                    #a_feed.feed.title.encode('utf-8'), \
                                    #article_title, description) \
                                #).start()



if __name__ == "__main__":
    # Point of entry in execution mode
    feed_getter = FeedGetter()
    feed_getter.retrieve_feed()