#! /usr/bin/env python
# -*- coding: utf-8 -*-

# pyAggr3g470r - A Web based news aggregator.
# Copyright (C) 2010-2013 Cédric Bonhomme - http://cedricbonhomme.org/
#
# For more information : http://bitbucket.org/cedricbonhomme/pyaggr3g470r/
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

__author__ = "Cedric Bonhomme"
__version__ = "$Revision: 1.4 $"
__date__ = "$Date: 2010/09/02 $"
__revision__ = "$Date: 2013/01/20 $"
__copyright__ = "Copyright (c) Cedric Bonhomme"
__license__ = "GPLv3"

import hashlib
import threading
import feedparser
from bs4 import BeautifulSoup
from datetime import datetime

import conf
import utils
import mongodb

list_of_threads = []


class FeedGetter(object):
    """
    This class is in charge of retrieving the feeds listed in the file
    pointed to by conf.FEED_LIST. It uses Mark Pilgrim's feedparser
    module and launches a new thread for each feed.
    """
    def __init__(self):
        """
        Initializes the database connection.
        """
        # MongoDB connection
        self.articles = mongodb.Articles(conf.MONGODB_ADDRESS, conf.MONGODB_PORT,
                                         conf.MONGODB_DBNAME, conf.MONGODB_USER,
                                         conf.MONGODB_PASSWORD)

    def retrieve_feed(self, feed_url=None, feed_original=None):
        """
        Parse the feed list file and launch a thread for each RSS feed.
        """
        if feed_url is not None:
            self.process(feed_url, feed_original)
        else:
            with open(conf.FEED_LIST) as f:
                for a_feed in f:
                    # test whether the URL is well formed
                    for url_regexp in utils.url_finders:
                        match = url_regexp.match(a_feed)
                        if match:
                            the_good_url = match.group(0).replace("\n", "")
                            try:
                                # launch a new thread for the RSS feed
                                thread = threading.Thread(None, self.process,
                                                          None, (the_good_url,))
                                thread.start()
                                list_of_threads.append(thread)
                            except Exception:
                                pass
                            break
        # wait until all threads are done
        for th in list_of_threads:
            th.join()

    def process(self, the_good_url, feed_original=None):
        """
        Request the URL. Executed in a thread.
        """
        if not utils.detect_url_errors([the_good_url]):
            # the resource is available: add its articles to the database
            self.add_into_database(the_good_url, feed_original)

    def add_into_database(self, feed_link, feed_original=None):
        """
        Add the articles of the feed 'feed_link' to the MongoDB database.
""" a_feed = feedparser.parse(feed_link) if a_feed['entries'] == []: return try: feed_image = a_feed.feed.image.href except: feed_image = "/img/feed-icon-28x28.png" if feed_original != None: feed_link = feed_original sha1_hash = hashlib.sha1() sha1_hash.update(feed_link.encode('utf-8')) feed_id = sha1_hash.hexdigest() feed = self.articles.get_feed(feed_id) if None == feed: collection_dic = {"feed_id": feed_id, \ "type": 0, \ "feed_image": feed_image, \ "feed_title": utils.clear_string(a_feed.feed.title), \ "feed_link": feed_link, \ "site_link": a_feed.feed.link, \ "mail": False \ } self.articles.add_collection(collection_dic) articles = [] for article in a_feed['entries']: description = "" try: # article content description = article.content[0].value except AttributeError: try: # article description description = article.description except Exception: description = "" description = str(BeautifulSoup(description)) article_title = str(BeautifulSoup(article.title)) try: post_date = datetime(*article.published_parsed[:6]) except: post_date = datetime(*article.updated_parsed[:6]) sha1_hash = hashlib.sha1() sha1_hash.update(article.link.encode('utf-8')) article_id = sha1_hash.hexdigest() article = {"article_id": article_id, \ "type":1, \ "article_date": post_date, \ "article_link": article.link, \ "article_title": article_title, \ "article_content": description, \ "article_readed": False, \ "article_like": False \ } articles.append(article) if conf.MAIL_ENABLED and feed["mail"] and self.articles.get_articles(feed_id, article_id) == False: # if subscribed to the feed AND if article not already in the database threading.Thread(None, utils.send_mail, None, (conf.mail_from, conf.mail_to, \ a_feed.feed.title, \ article_title, description)).start() self.articles.add_articles(articles, feed_id) if __name__ == "__main__": # Point of entry in execution mode feed_getter = FeedGetter() # Retrieve all feeds feed_getter.retrieve_feed() # If you want to get all articles of a blog: """ for i in range(1,86): feed_original = "http://esr.ibiblio.org/?feed=rss2" feed = feed_original + "&paged=" + str(i) print("Retrieving", feed, "...") feed_getter.retrieve_feed(feed, feed_original) """ """ for i in range(1,5): feed_original = "http://spaf.wordpress.com/feed/" feed = feed_original + "?paged=" + str(i) print("Retrieving", feed, "...") feed_getter.retrieve_feed(feed, feed_original) """ # For a blogspot blog: #feed_getter.retrieve_feed("http://www.blogger.com/feeds/4195135246107166251/posts/default", "http://neopythonic.blogspot.com/feeds/posts/default") #feed_getter.retrieve_feed("http://www.blogger.com/feeds/8699431508730375743/posts/default", "http://python-history.blogspot.com/feeds/posts/default")