4 files changed, 28 insertions, 218 deletions
diff --git a/fetch.py b/fetch.py
index 1bf10239..e65fc1b1 100644
--- a/fetch.py
+++ b/fetch.py
@@ -8,7 +8,7 @@
 # to fetch articles every 30 minutes.
 
 import sys
-from pyaggr3g470r import feedgetter
+from pyaggr3g470r import crawler
 
 if __name__ == "__main__":
     # Point of entry in execution mode
@@ -16,5 +16,5 @@ if __name__ == "__main__":
         feed_id = int(sys.argv[2])
     except:
         feed_id = None
-    feed_getter = feedgetter.FeedGetter(sys.argv[1])
+    feed_getter = crawler.FeedGetter(sys.argv[1])
     feed_getter.retrieve_feed(feed_id)
diff --git a/pyaggr3g470r/crawler.py b/pyaggr3g470r/crawler.py
index 15849b9d..0393ae31 100644
--- a/pyaggr3g470r/crawler.py
+++ b/pyaggr3g470r/crawler.py
@@ -1,6 +1,31 @@
 #! /usr/bin/env python
 # -*- coding: utf-8 -
 
+# pyAggr3g470r - A Web based news aggregator.
+# Copyright (C) 2010-2014  Cédric Bonhomme - http://cedricbonhomme.org/
+#
+# For more information : https://bitbucket.org/cedricbonhomme/pyaggr3g470r/
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as
+# published by the Free Software Foundation, either version 3 of the
+# License, or (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU Affero General Public License for more details.
+#
+# You should have received a copy of the GNU Affero General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+__author__ = "Cedric Bonhomme"
+__version__ = "$Revision: 2.1 $"
+__date__ = "$Date: 2010/09/02 $"
+__revision__ = "$Date: 2014/04/13 $"
+__copyright__ = "Copyright (c) Cedric Bonhomme"
+__license__ = "AGPLv3"
+
 import feedparser
 import urllib2
 import requests
diff --git a/pyaggr3g470r/feedgetter.py b/pyaggr3g470r/feedgetter.py
deleted file mode 100644
index b06c8eff..00000000
--- a/pyaggr3g470r/feedgetter.py
+++ /dev/null
@@ -1,215 +0,0 @@
-#! /usr/bin/env python
-#-*- coding: utf-8 -*-
-
-# pyAggr3g470r - A Web based news aggregator.
-# Copyright (C) 2010-2014  Cédric Bonhomme - http://cedricbonhomme.org/
-#
-# For more information : https://bitbucket.org/cedricbonhomme/pyaggr3g470r/
-#
-# This program is free software: you can redistribute it and/or modify
-# it under the terms of the GNU Affero General Public License as
-# published by the Free Software Foundation, either version 3 of the
-# License, or (at your option) any later version.
-#
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-# GNU Affero General Public License for more details.
-#
-# You should have received a copy of the GNU Affero General Public License
-# along with this program.  If not, see <http://www.gnu.org/licenses/>.
-
-__author__ = "Cedric Bonhomme"
-__version__ = "$Revision: 2.0 $"
-__date__ = "$Date: 2010/09/02 $"
-__revision__ = "$Date: 2013/11/10 $"
-__copyright__ = "Copyright (c) Cedric Bonhomme"
-__license__ = "AGPLv3"
-
-import urllib2
-import requests
-import threading
-import feedparser
-from datetime import datetime
-from BeautifulSoup import BeautifulSoup
-
-from requests.exceptions import Timeout
-
-from sqlalchemy.exc import IntegrityError
-
-import models
-import conf
-if not conf.ON_HEROKU:
-    import search
-import utils
-
-if not conf.ON_HEROKU:
-    from flask.ext.mail import Message
-    from pyaggr3g470r import mail
-    
-from pyaggr3g470r import app, db
-from pyaggr3g470r.models import User, Feed, Article
-
-import log
-pyaggr3g470r_log = log.Log("feedgetter")
-
-list_of_threads = []
-
-class FeedGetter(object):
-    """
-    This class is in charge of retrieving feeds listed in ./var/feed.lst.
-    This class uses feedparser module from Mark Pilgrim.
-    For each feed a new thread is launched.
-    """
-    def __init__(self, email):
-        """
-        Initializes the database connection.
-        """
-        feedparser.USER_AGENT = conf.USER_AGENT
-        if conf.HTTP_PROXY == "":
-            self.proxy = urllib2.ProxyHandler({})
-            self.proxies = {}
-        else:
-            self.proxy = urllib2.ProxyHandler({"http" : conf.HTTP_PROXY, \
-                                               "https": conf.HTTP_PROXY})
-            self.proxies = {
-                            "http": "http://" + conf.HTTP_PROXY,
-                            "https": "http://" + conf.HTTP_PROXY
-                           }
-        feedparser.USER_AGENT = conf.USER_AGENT
-        self.user = User.query.filter(User.email == email).first()
-
-    def retrieve_feed(self, feed_id=None):
-        """
-        Parse the file 'feeds.lst' and launch a thread for each RSS feed.
-        """
-        feeds = [feed for feed in self.user.feeds if feed.enabled]
-        if feed_id != None:
-            feeds = [feed for feed in feeds if feed.id == feed_id]
-        for current_feed in feeds:
-            try:
-                # launch a new thread for the RSS feed
-                thread = threading.Thread(None, self.process, \
-                                           None, (current_feed, ))
-                thread.start()
-                list_of_threads.append(thread)
-            except:
-                pass
-
-        # wait for all threads are done
-        for th in list_of_threads:
-            th.join()
-
-    def process(self, feed):
-        """
-        Retrieves articles form the feed and add them to the database.
-        """
-        a_feed = feedparser.parse(feed.link, handlers = [self.proxy])
-        if a_feed['entries'] == []:
-            return
-
-        # Feed informations
-        if feed.title == "":
-            try:
-                feed.title = a_feed.feed.title
-            except:
-                feed.title = ""
-        if feed.description == "":
-            try:
-                feed.description = a_feed.feed.subtitle
-            except:
-                feed.description = ""
-
-        articles = []
-        for article in a_feed['entries']:
-
-            nice_url = article.link.encode("utf-8")
-            if conf.RESOLVE_ARTICLE_URL:
-                try:
-                    # resolves URL behind proxies (like feedproxy.google.com)
-                    r = requests.get(article.link, timeout=5.0, proxies=self.proxies)
-                    nice_url = r.url.encode("utf-8")
-                except Timeout:
-                    pyaggr3g470r_log.warning("Timeout when getting the real URL of %s." % (article.link,))
-                    continue
-                except Exception as e:
-                    pyaggr3g470r_log.warning("Unable to get the real URL of %s. Error: %s" % (article.link, str(e)))
-                    continue
-            # remove utm_* parameters
-            nice_url = utils.clean_url(nice_url)
-
-            exist1 = Article.query.filter(Article.user_id == self.user.id, Article.link == nice_url).first()
-            exist2 = Article.query.filter(Article.user_id == self.user.id, Article.link == utils.clean_url(article.link.encode("utf-8"))).first()
-            if exist1 != None or exist2 != None:
-                continue
-
-            description = ""
-            article_title = ""
-            try:
-                # article content
-                description = article.content[0].value
-            except AttributeError:
-                try:
-                    # article description
-                    description = article.description
-                except Exception:
-                    description = ""
-            try:
-                description = BeautifulSoup(description, "html.parser").decode()
-                article_title = BeautifulSoup(article.title, "html.parser").decode()
-            except Exception as E:
-                #pyaggr3g470r_log.error("Problem when sanitizing the content of the article %s (%s)" % (article_title, nice_url))
-                article_title = article.title
-
-            try:
-                post_date = datetime(*article.published_parsed[:6])
-            except:
-                post_date = datetime(*article.updated_parsed[:6])
-
-            # save the article
-            article = Article(link=nice_url, title=article_title, \
-                              content=description, readed=False, like=False, date=post_date, \
-                              user_id=self.user.id, feed_id=feed.id)
-            articles.append(article)
-
-            # add the article to the Whoosh index only if we are not on Heroku
-            """
-            if not conf.ON_HEROKU:
-                try:
-                    search.add_to_index([article], feed)
-                except Exception as e:
-                    pyaggr3g470r_log.error("Whoosh error.")
-                    pass"""
-
-            # email notification
-            if conf.MAIL_ENABLED and feed.email_notification:
-                with app.app_context():
-                    msg = Message('[pyAggr3g470r] ' + feed.title + ' : ' + article.title, \
-                                sender = conf.MAIL_FROM, recipients = [conf.MAIL_TO])
-                    msg.body = utils.clear_string(description)
-                    msg.html = description
-                    mail.send(msg)
-
-        # add the articles to the list of articles for the current feed
-        for article in articles:
-            try:
-                feed.articles.append(article)
-                db.session.merge(article)
-                db.session.commit()
-                pyaggr3g470r_log.info("New article %s (%s) added." % (article_title, nice_url))
-            except IntegrityError:
-                pyaggr3g470r_log.error("Article %s (%s) already in the database." % (article_title, nice_url))
-                db.session.rollback()
-                continue
-            except Exception as e:
-                pyaggr3g470r_log.error("Error when inserting article in database: " + str(e))
-                continue
-        db.session.close()
-        return True
-
-
-if __name__ == "__main__":
-    # Point of entry in execution mode
-    feed_getter = FeedGetter()
-    # Retrieve all feeds
-    feed_getter.retrieve_feed()
diff --git a/pyaggr3g470r/views.py b/pyaggr3g470r/views.py
index c6828b25..e88fe938 100644
--- a/pyaggr3g470r/views.py
+++ b/pyaggr3g470r/views.py
@@ -38,7 +38,7 @@ from werkzeug import generate_password_hash
 import conf
 import utils
 import export
-import feedgetter, crawler
+import crawler
 import models
 if not conf.ON_HEROKU:
     import search as fastsearch