diff options
-rw-r--r-- | fetch.py | 4 | ||||
-rw-r--r-- | pyaggr3g470r/crawler.py | 25 | ||||
-rw-r--r-- | pyaggr3g470r/feedgetter.py | 215 | ||||
-rw-r--r-- | pyaggr3g470r/views.py | 2 |
4 files changed, 28 insertions, 218 deletions
@@ -8,7 +8,7 @@ # to fetch articles every 30 minutes. import sys -from pyaggr3g470r import feedgetter +from pyaggr3g470r import crawler if __name__ == "__main__": # Point of entry in execution mode @@ -16,5 +16,5 @@ if __name__ == "__main__": feed_id = int(sys.argv[2]) except: feed_id = None - feed_getter = feedgetter.FeedGetter(sys.argv[1]) + feed_getter = crawler.FeedGetter(sys.argv[1]) feed_getter.retrieve_feed(feed_id) diff --git a/pyaggr3g470r/crawler.py b/pyaggr3g470r/crawler.py index 15849b9d..0393ae31 100644 --- a/pyaggr3g470r/crawler.py +++ b/pyaggr3g470r/crawler.py @@ -1,6 +1,31 @@ #! /usr/bin/env python # -*- coding: utf-8 - +# pyAggr3g470r - A Web based news aggregator. +# Copyright (C) 2010-2014 Cédric Bonhomme - http://cedricbonhomme.org/ +# +# For more information : https://bitbucket.org/cedricbonhomme/pyaggr3g470r/ +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as +# published by the Free Software Foundation, either version 3 of the +# License, or (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. +# +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <http://www.gnu.org/licenses/>. + +__author__ = "Cedric Bonhomme" +__version__ = "$Revision: 2.1 $" +__date__ = "$Date: 2010/09/02 $" +__revision__ = "$Date: 2014/04/13 $" +__copyright__ = "Copyright (c) Cedric Bonhomme" +__license__ = "AGPLv3" + import feedparser import urllib2 import requests diff --git a/pyaggr3g470r/feedgetter.py b/pyaggr3g470r/feedgetter.py deleted file mode 100644 index b06c8eff..00000000 --- a/pyaggr3g470r/feedgetter.py +++ /dev/null @@ -1,215 +0,0 @@ -#! /usr/bin/env python -#-*- coding: utf-8 -*- - -# pyAggr3g470r - A Web based news aggregator. -# Copyright (C) 2010-2014 Cédric Bonhomme - http://cedricbonhomme.org/ -# -# For more information : https://bitbucket.org/cedricbonhomme/pyaggr3g470r/ -# -# This program is free software: you can redistribute it and/or modify -# it under the terms of the GNU Affero General Public License as -# published by the Free Software Foundation, either version 3 of the -# License, or (at your option) any later version. -# -# This program is distributed in the hope that it will be useful, -# but WITHOUT ANY WARRANTY; without even the implied warranty of -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -# GNU Affero General Public License for more details. -# -# You should have received a copy of the GNU Affero General Public License -# along with this program. If not, see <http://www.gnu.org/licenses/>. - -__author__ = "Cedric Bonhomme" -__version__ = "$Revision: 2.0 $" -__date__ = "$Date: 2010/09/02 $" -__revision__ = "$Date: 2013/11/10 $" -__copyright__ = "Copyright (c) Cedric Bonhomme" -__license__ = "AGPLv3" - -import urllib2 -import requests -import threading -import feedparser -from datetime import datetime -from BeautifulSoup import BeautifulSoup - -from requests.exceptions import Timeout - -from sqlalchemy.exc import IntegrityError - -import models -import conf -if not conf.ON_HEROKU: - import search -import utils - -if not conf.ON_HEROKU: - from flask.ext.mail import Message - from pyaggr3g470r import mail - -from pyaggr3g470r import app, db -from pyaggr3g470r.models import User, Feed, Article - -import log -pyaggr3g470r_log = log.Log("feedgetter") - -list_of_threads = [] - -class FeedGetter(object): - """ - This class is in charge of retrieving feeds listed in ./var/feed.lst. - This class uses feedparser module from Mark Pilgrim. - For each feed a new thread is launched. - """ - def __init__(self, email): - """ - Initializes the database connection. - """ - feedparser.USER_AGENT = conf.USER_AGENT - if conf.HTTP_PROXY == "": - self.proxy = urllib2.ProxyHandler({}) - self.proxies = {} - else: - self.proxy = urllib2.ProxyHandler({"http" : conf.HTTP_PROXY, \ - "https": conf.HTTP_PROXY}) - self.proxies = { - "http": "http://" + conf.HTTP_PROXY, - "https": "http://" + conf.HTTP_PROXY - } - feedparser.USER_AGENT = conf.USER_AGENT - self.user = User.query.filter(User.email == email).first() - - def retrieve_feed(self, feed_id=None): - """ - Parse the file 'feeds.lst' and launch a thread for each RSS feed. - """ - feeds = [feed for feed in self.user.feeds if feed.enabled] - if feed_id != None: - feeds = [feed for feed in feeds if feed.id == feed_id] - for current_feed in feeds: - try: - # launch a new thread for the RSS feed - thread = threading.Thread(None, self.process, \ - None, (current_feed, )) - thread.start() - list_of_threads.append(thread) - except: - pass - - # wait for all threads are done - for th in list_of_threads: - th.join() - - def process(self, feed): - """ - Retrieves articles form the feed and add them to the database. - """ - a_feed = feedparser.parse(feed.link, handlers = [self.proxy]) - if a_feed['entries'] == []: - return - - # Feed informations - if feed.title == "": - try: - feed.title = a_feed.feed.title - except: - feed.title = "" - if feed.description == "": - try: - feed.description = a_feed.feed.subtitle - except: - feed.description = "" - - articles = [] - for article in a_feed['entries']: - - nice_url = article.link.encode("utf-8") - if conf.RESOLVE_ARTICLE_URL: - try: - # resolves URL behind proxies (like feedproxy.google.com) - r = requests.get(article.link, timeout=5.0, proxies=self.proxies) - nice_url = r.url.encode("utf-8") - except Timeout: - pyaggr3g470r_log.warning("Timeout when getting the real URL of %s." % (article.link,)) - continue - except Exception as e: - pyaggr3g470r_log.warning("Unable to get the real URL of %s. Error: %s" % (article.link, str(e))) - continue - # remove utm_* parameters - nice_url = utils.clean_url(nice_url) - - exist1 = Article.query.filter(Article.user_id == self.user.id, Article.link == nice_url).first() - exist2 = Article.query.filter(Article.user_id == self.user.id, Article.link == utils.clean_url(article.link.encode("utf-8"))).first() - if exist1 != None or exist2 != None: - continue - - description = "" - article_title = "" - try: - # article content - description = article.content[0].value - except AttributeError: - try: - # article description - description = article.description - except Exception: - description = "" - try: - description = BeautifulSoup(description, "html.parser").decode() - article_title = BeautifulSoup(article.title, "html.parser").decode() - except Exception as E: - #pyaggr3g470r_log.error("Problem when sanitizing the content of the article %s (%s)" % (article_title, nice_url)) - article_title = article.title - - try: - post_date = datetime(*article.published_parsed[:6]) - except: - post_date = datetime(*article.updated_parsed[:6]) - - # save the article - article = Article(link=nice_url, title=article_title, \ - content=description, readed=False, like=False, date=post_date, \ - user_id=self.user.id, feed_id=feed.id) - articles.append(article) - - # add the article to the Whoosh index only if we are not on Heroku - """ - if not conf.ON_HEROKU: - try: - search.add_to_index([article], feed) - except Exception as e: - pyaggr3g470r_log.error("Whoosh error.") - pass""" - - # email notification - if conf.MAIL_ENABLED and feed.email_notification: - with app.app_context(): - msg = Message('[pyAggr3g470r] ' + feed.title + ' : ' + article.title, \ - sender = conf.MAIL_FROM, recipients = [conf.MAIL_TO]) - msg.body = utils.clear_string(description) - msg.html = description - mail.send(msg) - - # add the articles to the list of articles for the current feed - for article in articles: - try: - feed.articles.append(article) - db.session.merge(article) - db.session.commit() - pyaggr3g470r_log.info("New article %s (%s) added." % (article_title, nice_url)) - except IntegrityError: - pyaggr3g470r_log.error("Article %s (%s) already in the database." % (article_title, nice_url)) - db.session.rollback() - continue - except Exception as e: - pyaggr3g470r_log.error("Error when inserting article in database: " + str(e)) - continue - db.session.close() - return True - - -if __name__ == "__main__": - # Point of entry in execution mode - feed_getter = FeedGetter() - # Retrieve all feeds - feed_getter.retrieve_feed() diff --git a/pyaggr3g470r/views.py b/pyaggr3g470r/views.py index c6828b25..e88fe938 100644 --- a/pyaggr3g470r/views.py +++ b/pyaggr3g470r/views.py @@ -38,7 +38,7 @@ from werkzeug import generate_password_hash import conf import utils import export -import feedgetter, crawler +import crawler import models if not conf.ON_HEROKU: import search as fastsearch |