Diffstat (limited to 'source/feedgetter.py')
-rwxr-xr-x  source/feedgetter.py  167
1 file changed, 167 insertions(+), 0 deletions(-)
diff --git a/source/feedgetter.py b/source/feedgetter.py
new file mode 100755
index 00000000..e3469132
--- /dev/null
+++ b/source/feedgetter.py
@@ -0,0 +1,167 @@
+#! /usr/bin/env python
+#-*- coding: utf-8 -*-
+
+# pyAggr3g470r - A Web based news aggregator.
+# Copyright (C) 2010 Cédric Bonhomme - http://cedricbonhomme.org/
+#
+# For more information: http://bitbucket.org/cedricbonhomme/pyaggr3g470r/
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>
+
+__author__ = "Cedric Bonhomme"
+__version__ = "$Revision: 1.0 $"
+__date__ = "$Date: 2010/09/02 $"
+__copyright__ = "Copyright (c) Cedric Bonhomme"
+__license__ = "GPLv3"
+
+import os.path
+import traceback
+import threading
+import feedparser
+import hashlib
+from BeautifulSoup import BeautifulSoup
+
+from datetime import datetime
+
+import utils
+import mongodb
+
+feeds_list = []
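+# worker threads are collected here so that retrieve_feed() can join them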
+list_of_threads = []
+
+
+class FeedGetter(object):
+    """
+    This class is in charge of retrieving the feeds listed in ./var/feed.lst.
+    It relies on Mark Pilgrim's feedparser module.
+    A new thread is launched for each feed.
+    """
+ def __init__(self):
+ """
+        Initializes the database connection.
+ """
+ # MongoDB connections
+ self.articles = mongodb.Articles()
+
+ def retrieve_feed(self):
+ """
+        Parse the file './var/feed.lst' and launch a thread for each RSS feed.
+ """
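+        # feed.lst holds one feed URL per line, e.g.:
+        #   http://example.org/feed.rss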
+ with open("./var/feed.lst") as f:
+ for a_feed in f:
+ # test if the URL is well formed
+ for url_regexp in utils.url_finders:
+ if url_regexp.match(a_feed):
+ the_good_url = url_regexp.match(a_feed).group(0).replace("\n", "")
+ try:
+ # launch a new thread for the RSS feed
+ thread = threading.Thread(None, self.process, \
+ None, (the_good_url,))
+ thread.start()
+ list_of_threads.append(thread)
+                        except:
+                            # ignore feeds whose thread could not be started
+                            pass
+                        break  # stop at the first pattern that matches
+
+        # wait until all threads are done
+ for th in list_of_threads:
+ th.join()
+
+ def process(self, the_good_url):
+ """Request the URL
+
+ Executed in a thread.
+ """
+ if utils.detect_url_errors([the_good_url]) == []:
+            # if the resource is available, add its articles to the database.
+ self.add_into_database(the_good_url)
+
+ def add_into_database(self, feed_link):
+        """
+        Add the articles of the feed at 'feed_link' to the MongoDB database.
+        """
+ a_feed = feedparser.parse(feed_link)
+ if a_feed['entries'] == []:
+ return
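+        # use the feed's image when available; otherwise fall back to the default icon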
+ try:
+ feed_image = a_feed.feed.image.href
+ except:
+ feed_image = "/img/feed-icon-28x28.png"
+
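+        # a feed is identified by the SHA-1 digest of its URL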
+ sha1_hash = hashlib.sha1()
+ sha1_hash.update(feed_link.encode('utf-8'))
+ feed_id = sha1_hash.hexdigest()
+
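+        # type 0 marks a feed collection (articles below use type 1)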
+ collection_dic = {"feed_id": feed_id, \
+ "type": 0, \
+ "feed_image": feed_image, \
+ "feed_title": utils.clear_string(a_feed.feed.title.encode('utf-8')), \
+ "feed_link": feed_link, \
+ "site_link": a_feed.feed.link.encode('utf-8'), \
+ "mail": False \
+ }
+
+ self.articles.add_collection(collection_dic)
+
+ articles = []
+ for article in a_feed['entries']:
+ description = ""
+ try:
+ # article content
+ description = article.content[0].value
+ except AttributeError:
+ try:
+ # article description
+ description = article.description
+                except Exception:
+                    description = ""
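+            # run the title and description through BeautifulSoup to normalize the HTML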
+ description = str(BeautifulSoup(description))
+ article_title = str(BeautifulSoup(article.title))
+
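+            # prefer the update date and fall back to the publication date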
+ try:
+ post_date = datetime(*article.updated_parsed[:6])
+ except:
+ post_date = datetime(*article.published_parsed[:6])
+
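+            # an article is identified by the SHA-1 digest of its link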
+ sha1_hash = hashlib.sha1()
+ sha1_hash.update(article.link.encode('utf-8'))
+ article_id = sha1_hash.hexdigest()
+
+            new_article = {"article_id": article_id, \
+                           "type": 1, \
+                           "article_date": post_date, \
+                           "article_link": article.link.encode('utf-8'), \
+                           "article_title": article_title, \
+                           "article_content": description, \
+                           "article_readed": False, \
+                           "article_like": False \
+                           }
+
+            articles.append(new_article)
+
+ self.articles.add_articles(articles, feed_id)
+
+ # send new articles by e-mail if desired.
+ #threading.Thread(None, utils.send_mail, None, (utils.mail_from, utils.mail_to, \
+ #a_feed.feed.title.encode('utf-8'), \
+ #article_title, description) \
+ #).start()
+
+
+if __name__ == "__main__":
+    # Entry point when run as a script
+ feed_getter = FeedGetter()
+ feed_getter.retrieve_feed()