#! /usr/bin/env python
# -*- coding: utf-8 -*-

# pyAggr3g470r - A Web based news aggregator.
# Copyright (C) 2010-2014  Cédric Bonhomme - http://cedricbonhomme.org/
#
# For more information : https://bitbucket.org/cedricbonhomme/pyaggr3g470r/
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as
# published by the Free Software Foundation, either version 3 of the
# License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Affero General Public License for more details.
#
# You should have received a copy of the GNU Affero General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

__author__ = "Cedric Bonhomme"
__version__ = "$Revision: 2.0 $"
__date__ = "$Date: 2010/09/02 $"
__revision__ = "$Date: 2013/11/10 $"
__copyright__ = "Copyright (c) Cedric Bonhomme"
__license__ = "AGPLv3"

import urllib2
import requests
import threading
import feedparser
from datetime import datetime
from bs4 import BeautifulSoup  # the "html.parser" builder used below requires bs4
from requests.exceptions import Timeout
from sqlalchemy.exc import IntegrityError

import models
import conf
#import search
import utils
if not conf.ON_HEROKU:
    from flask.ext.mail import Message
    from pyaggr3g470r import mail

from pyaggr3g470r import app, db
from pyaggr3g470r.models import User, Feed, Article

import log
pyaggr3g470r_log = log.Log("feedgetter")

list_of_threads = []


class FeedGetter(object):
    """
    This class is in charge of retrieving the feeds of a user
    (loaded from the database). The feedparser module of Mark Pilgrim
    is used to parse the feeds. A new thread is launched for each feed.
    """
    def __init__(self, email):
        """
        Initializes the user agent, the proxy settings and the current user.
        """
        feedparser.USER_AGENT = conf.USER_AGENT
        if conf.HTTP_PROXY == "":
            self.proxy = urllib2.ProxyHandler({})
            self.proxies = {}
        else:
            self.proxy = urllib2.ProxyHandler({"http": conf.HTTP_PROXY,
                                               "https": conf.HTTP_PROXY})
            self.proxies = {
                            "http": "http://" + conf.HTTP_PROXY,
                            "https": "http://" + conf.HTTP_PROXY
                           }
        self.user = User.query.filter(User.email == email).first()

    def retrieve_feed(self, feed_id=None):
        """
        Launch a thread for each enabled feed of the user
        (or only for the feed 'feed_id' if it is given).
        """
        feeds = [feed for feed in self.user.feeds if feed.enabled]
        if feed_id is not None:
            feeds = [feed for feed in feeds if str(feed.id) == feed_id]
        for current_feed in feeds:
            try:
                # launch a new thread for the RSS feed
                thread = threading.Thread(None, self.process,
                                          None, (current_feed,))
                thread.start()
                list_of_threads.append(thread)
            except Exception:
                # if the thread can not be started the feed is simply skipped
                continue
        # wait until all threads are done
        for th in list_of_threads:
            th.join()

    def process(self, feed):
        """
        Retrieves articles from the feed and adds them to the database.
        """
        a_feed = feedparser.parse(feed.link, handlers=[self.proxy])
        if a_feed['entries'] == []:
            return

        # feed information
        if feed.title == "":
            try:
                feed.title = a_feed.feed.title
            except AttributeError:
                feed.title = ""
        if feed.description == "":
            try:
                feed.description = a_feed.feed.subtitle
            except AttributeError:
                feed.description = ""

        articles = []
        for article in a_feed['entries']:

            nice_url = article.link.encode("utf-8")
            if conf.RESOLVE_ARTICLE_URL:
                try:
                    # resolves URL behind proxies (like feedproxy.google.com)
                    r = requests.get(article.link, timeout=5.0, proxies=self.proxies)
                    nice_url = r.url.encode("utf-8")
                except Timeout:
                    pyaggr3g470r_log.warning("Timeout when getting the real URL of %s." % (article.link,))
                    continue
                except Exception as e:
                    pyaggr3g470r_log.warning("Unable to get the real URL of %s. Error: %s" % (article.link, str(e)))
                    continue
            # remove utm_* parameters
            nice_url = utils.clean_url(nice_url)

            exist1 = Article.query.filter(Article.user_id == self.user.id,
                                          Article.link == nice_url).first()
            exist2 = Article.query.filter(Article.user_id == self.user.id,
                                          Article.link == utils.clean_url(article.link.encode("utf-8"))).first()
            if exist1 is not None or exist2 is not None:
                continue

            description = ""
            article_title = ""
            try:
                # article content
                description = article.content[0].value
            except AttributeError:
                try:
                    # article description
                    description = article.description
                except Exception:
                    description = ""

            try:
                description = BeautifulSoup(description, "html.parser").decode()
                article_title = BeautifulSoup(article.title, "html.parser").decode()
            except Exception:
                #pyaggr3g470r_log.error("Problem when sanitizing the content of the article %s (%s)" % (article_title, nice_url))
                article_title = article.title

            try:
                post_date = datetime(*article.published_parsed[:6])
            except (AttributeError, TypeError):
                post_date = datetime(*article.updated_parsed[:6])

            # save the article
            article = Article(link=nice_url, title=article_title,
                              content=description, readed=False, like=False,
                              date=post_date, user_id=self.user.id,
                              feed_id=feed.id)
            articles.append(article)

            # add the article to the Whoosh index
            """
            try:
                search.add_to_index([article], feed)
            except Exception as e:
                pyaggr3g470r_log.error("Whoosh error.")
                pass
            """

            # email notification
            if conf.MAIL_ENABLED and feed.email_notification:
                with app.app_context():
                    msg = Message('[pyAggr3g470r] ' + feed.title + ' : ' + article.title,
                                  sender=conf.MAIL_FROM, recipients=[conf.MAIL_TO])
                    msg.body = utils.clear_string(description)
                    msg.html = description
                    mail.send(msg)

        # add the articles to the list of articles for the current feed
        for article in articles:
            try:
                feed.articles.append(article)
                db.session.merge(article)
                db.session.commit()
                pyaggr3g470r_log.info("New article %s (%s) added." % (article.title, article.link))
            except IntegrityError:
                pyaggr3g470r_log.error("Article %s (%s) already in the database." % (article.title, article.link))
                db.session.rollback()
                continue
            except Exception as e:
                pyaggr3g470r_log.error("Error when inserting article in database: " + str(e))
                continue
        db.session.close()
        return True


if __name__ == "__main__":
    # Point of entry in execution mode.
    # FeedGetter() requires the email address of an existing user;
    # "user@example.com" below is only a placeholder.
    feed_getter = FeedGetter("user@example.com")
    # Retrieve all feeds
    feed_getter.retrieve_feed()
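
# A minimal usage sketch (assuming a user with the email address
# "user@example.com" already exists in the database): a single feed can be
# refreshed by passing its identifier, as a string, to retrieve_feed().
#
#     getter = FeedGetter("user@example.com")
#     getter.retrieve_feed(feed_id="1")  # "1" is a hypothetical feed id
#
# Without the feed_id argument, retrieve_feed() processes every enabled feed
# of the user, with one thread per feed.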