Diffstat (limited to 'source/feedgetter.py')
-rwxr-xr-x | source/feedgetter.py | 231 |
1 files changed, 0 insertions, 231 deletions
diff --git a/source/feedgetter.py b/source/feedgetter.py
deleted file mode 100755
index ce1cba1b..00000000
--- a/source/feedgetter.py
+++ /dev/null
@@ -1,231 +0,0 @@
-#! /usr/bin/env python
-#-*- coding: utf-8 -*-
-
-# pyAggr3g470r - A Web based news aggregator.
-# Copyright (C) 2010-2013 Cédric Bonhomme - http://cedricbonhomme.org/
-#
-# For more information : http://bitbucket.org/cedricbonhomme/pyaggr3g470r/
-#
-# This program is free software: you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published by
-# the Free Software Foundation, either version 3 of the License, or
-# (at your option) any later version.
-#
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-# GNU General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License
-# along with this program. If not, see <http://www.gnu.org/licenses/>
-
-__author__ = "Cedric Bonhomme"
-__version__ = "$Revision: 1.8 $"
-__date__ = "$Date: 2010/09/02 $"
-__revision__ = "$Date: 2013/08/15 $"
-__copyright__ = "Copyright (c) Cedric Bonhomme"
-__license__ = "GPLv3"
-
-import hashlib
-import threading
-import urllib.request
-import feedparser
-from bs4 import BeautifulSoup
-from datetime import datetime
-from contextlib import contextmanager
-
-import conf
-import search
-import utils
-import mongodb
-
-import log
-pyaggr3g470r_log = log.Log()
-
-list_of_threads = []
-
-@contextmanager
-def opened_w_error(filename, mode="r"):
-    try:
-        f = open(filename, mode)
-    except IOError as err:
-        yield None, err
-    else:
-        try:
-            yield f, None
-        finally:
-            f.close()
-
-class FeedGetter(object):
-    """
-    This class is in charge of retrieving feeds listed in ./var/feed.lst.
-    This class uses the feedparser module from Mark Pilgrim.
-    For each feed a new thread is launched.
-    """
-    def __init__(self):
-        """
-        Initializes the database connection.
-        """
-        # MongoDB connections
-        self.articles = mongodb.Articles(conf.MONGODB_ADDRESS, conf.MONGODB_PORT, \
-                            conf.MONGODB_DBNAME, conf.MONGODB_USER, conf.MONGODB_PASSWORD)
-        if conf.HTTP_PROXY == "":
-            self.proxy = urllib.request.ProxyHandler({})
-        else:
-            self.proxy = urllib.request.ProxyHandler({"http": conf.HTTP_PROXY})
-        feedparser.USER_AGENT = conf.USER_AGENT
-
-    def retrieve_feed(self, feed_url=None, feed_original=None):
-        """
-        Parse the file 'feeds.lst' and launch a thread for each RSS feed.
-        """
-        if feed_url != None:
-            self.process(feed_url, feed_original)
-        else:
-            with opened_w_error(conf.FEED_LIST) as (f, err):
-                if err:
-                    pyaggr3g470r_log.error("List of feeds not found.")
-                else:
-                    for a_feed in f:
-                        # test whether the URL is well formed
-                        for url_regexp in utils.url_finders:
-                            if url_regexp.match(a_feed):
-                                the_good_url = url_regexp.match(a_feed).group(0).replace("\n", "")
-                                try:
-                                    # launch a new thread for the RSS feed
-                                    thread = threading.Thread(None, self.process, \
-                                                        None, (the_good_url,))
-                                    thread.start()
-                                    list_of_threads.append(thread)
-                                except:
-                                    pass
-                                break
-
-        # wait until all threads are done
-        for th in list_of_threads:
-            th.join()
-
-    def process(self, the_good_url, feed_original=None):
-        """Request the URL.
-
-        Executed in a thread.
-        """
-        if utils.open_url(the_good_url)[0] == True:
-            # if the resource is available, add its articles to the database
-            self.add_into_database(the_good_url, feed_original)
-
-    def add_into_database(self, feed_link, feed_original=None):
-        """
-        Add the articles of the feed 'a_feed' to the database.
-        """
-        a_feed = feedparser.parse(feed_link, handlers = [self.proxy])
-        if a_feed['entries'] == []:
-            return
-        try:
-            feed_image = a_feed.feed.image.href
-        except:
-            feed_image = "/img/feed-icon-28x28.png"
-
-        if feed_original != None:
-            feed_link = feed_original
-
-        sha1_hash = hashlib.sha1()
-        sha1_hash.update(feed_link.encode('utf-8'))
-        feed_id = sha1_hash.hexdigest()
-
-        feed = self.articles.get_feed(feed_id)
-        if None == feed:
-            collection_dic = {"feed_id": feed_id, \
-                              "type": 0, \
-                              "feed_image": feed_image, \
-                              "feed_title": utils.clear_string(a_feed.feed.title), \
-                              "feed_link": feed_link, \
-                              "site_link": a_feed.feed.link, \
-                              "mail": False \
-                             }
-            self.articles.add_collection(collection_dic)
-            feed = self.articles.get_feed(feed_id)
-
-        articles = []
-        for article in a_feed['entries']:
-            description = ""
-            article_title = ""
-            try:
-                # article content
-                description = article.content[0].value
-            except AttributeError:
-                try:
-                    # article description
-                    description = article.description
-                except Exception:
-                    description = ""
-            try:
-                description = BeautifulSoup(description, "html.parser").decode()
-                article_title = BeautifulSoup(article.title, "html.parser").decode()
-            except Exception as E:
-                pyaggr3g470r_log.error("Problem when sanitizing the content of the feed: " + feed_link)
-                article_title = article.title
-
-            try:
-                post_date = datetime(*article.published_parsed[:6])
-            except:
-                post_date = datetime(*article.updated_parsed[:6])
-
-            sha1_hash = hashlib.sha1()
-            sha1_hash.update(article.link.encode('utf-8'))
-            article_id = sha1_hash.hexdigest()
-
-            article = {"article_id": article_id, \
-                       "type": 1, \
-                       "article_date": post_date, \
-                       "article_link": article.link, \
-                       "article_title": article_title, \
-                       "article_content": description, \
-                       "article_readed": False, \
-                       "article_like": False \
-                      }
-
-            articles.append(article)
-
-            if self.articles.get_articles(feed_id, article_id) == []:
-                # add the article to the Whoosh index
-                try:
-                    search.add_to_index([article], feed)
-                except:
-                    print("Whoosh error.")
-                    pyaggr3g470r_log.error("Whoosh error.")
-                    continue
-
-                if conf.MAIL_ENABLED and feed["mail"]:
-                    # if subscribed to the feed
-                    threading.Thread(None, utils.send_mail, None, (conf.mail_from, conf.mail_to, \
-                                                a_feed.feed.title, \
-                                                article_title, description)).start()
-        self.articles.add_articles(articles, feed_id)
-
-
-if __name__ == "__main__":
-    # Point of entry in execution mode
-    feed_getter = FeedGetter()
-    # Retrieve all feeds
-    feed_getter.retrieve_feed()
-
-    # If you want to get all articles of a blog:
-    """
-    for i in range(1,86):
-        feed_original = "http://esr.ibiblio.org/?feed=rss2"
-        feed = feed_original + "&paged=" + str(i)
-        print("Retrieving", feed, "...")
-        feed_getter.retrieve_feed(feed, feed_original)
-    """
-    """
-    for i in range(1,5):
-        feed_original = "http://spaf.wordpress.com/feed/"
-        feed = feed_original + "?paged=" + str(i)
-        print("Retrieving", feed, "...")
-        feed_getter.retrieve_feed(feed, feed_original)
-    """
-
-    # For a blogspot blog:
-    #feed_getter.retrieve_feed("http://www.blogger.com/feeds/4195135246107166251/posts/default", "http://neopythonic.blogspot.com/feeds/posts/default")
-    #feed_getter.retrieve_feed("http://www.blogger.com/feeds/8699431508730375743/posts/default", "http://python-history.blogspot.com/feeds/posts/default")
\ No newline at end of file
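
The deleted FeedGetter derived stable document ids by SHA-1-hashing the feed or article URL (see add_into_database above), so re-fetching a feed mapped onto the same records instead of duplicating them. A minimal standalone sketch of that scheme, assuming only the standard library; the helper name url_to_id is hypothetical, not part of the removed module:

import hashlib

def url_to_id(url):
    # SHA-1 over the UTF-8 bytes of the URL, hex digest as the id,
    # exactly the steps the deleted add_into_database() performed
    # for both feed_id and article_id.
    sha1_hash = hashlib.sha1()
    sha1_hash.update(url.encode('utf-8'))
    return sha1_hash.hexdigest()

# The same URL always yields the same id, which is how the module
# recognized already-stored feeds and articles across fetches.
print(url_to_id("http://example.org/feed"))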