From 446cf53d8fef63a0c4503f8389c5c26a41d3875f Mon Sep 17 00:00:00 2001 From: cedric Date: Fri, 29 Jan 2010 21:09:46 +0100 Subject: Initial import --- feedgetter.py | 85 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 85 insertions(+) create mode 100644 feedgetter.py (limited to 'feedgetter.py') diff --git a/feedgetter.py b/feedgetter.py new file mode 100644 index 00000000..ed0ec132 --- /dev/null +++ b/feedgetter.py @@ -0,0 +1,85 @@ +#! /usr/local/bin/python +#-*- coding: utf-8 -*- + +__author__ = "Cedric Bonhomme" +__version__ = "$Revision: 0.1 $" +__date__ = "$Date: 2010/29/01 $" +__copyright__ = "Copyright (c) 2010 Cedric Bonhomme" +__license__ = "GPLv3" + +import re +import sqlite3 +import threading +import feedparser + +url_finders = [ \ + re.compile("([0-9]{1,3}\\.[0-9]{1,3}\\.[0-9]{1,3}\\.[0-9]{1,3}|(((news|telnet|nttp|file|http|ftp|https)://)|(www|ftp)[-A-Za-z0-9]*\\.)[-A-Za-z0-9\\.]+)(:[0-9]*)?/[-A-Za-z0-9_\\$\\.\\+\\!\\*\\(\\),;:@&=\\?/~\\#\\%]*[^]'\\.}>\\),\\\"]"), \ + re.compile("([0-9]{1,3}\\.[0-9]{1,3}\\.[0-9]{1,3}\\.[0-9]{1,3}|(((news|telnet|nttp|file|http|ftp|https)://)|(www|ftp)[-A-Za-z0-9]*\\.)[-A-Za-z0-9\\.]+)(:[0-9]*)?"), \ + re.compile("(~/|/|\\./)([-A-Za-z0-9_\\$\\.\\+\\!\\*\\(\\),;:@&=\\?/~\\#\\%]|\\\\)+"), \ + re.compile("'\\<((mailto:)|)[-A-Za-z0-9\\.]+@[-A-Za-z0-9\\.]+"), \ +] + +feeds_list = [] +list_of_threads = [] + + + + +def process(the_good_url): + """Request the URL + + Executed in a thread. + """ + feeds_list.append(feedparser.parse(the_good_url)) + +def retrieve_feed(): + """ + Parse the file 'feeds.lst' and launch a thread for each RSS feed. + """ + conn = sqlite3.connect("feed.db", isolation_level = None) + c = conn.cursor() + c.execute('''create table rss_feed + (date text, feed_title text, feed_site_link text, article_title text , article_link text)''') + + for a_feed in feeds_file.readlines(): + # test if the URL is well formed + for url_regexp in url_finders: + if url_regexp.match(a_feed): + the_good_url = url_regexp.match(a_feed).group(0).replace("\n", "") + try: + # launch a new thread for the RSS feed + thread = threading.Thread(None, process, \ + None, (the_good_url,)) + thread.start() + list_of_threads.append(thread) + except: + pass + break + + # wait for all threads are done + for th in list_of_threads: + th.join() + + # when all jobs are done, insert articles in the base + for a_feed in feeds_list: + for article in a_feed['entries']: + c.execute('insert into rss_feed values (?,?,?,?,?)', (\ + "-".join([str(i) for i in list(article.updated_parsed)]), \ + a_feed.feed.title.encode('utf-8'), \ + a_feed.feed.link.encode('utf-8'), \ + article.title.encode('utf-8'), \ + article.link.encode('utf-8'))) + + conn.commit() + c.close() + + +if __name__ == "__main__": + # Point of entry in execution mode + try: + feeds_file = open("./var/feed.lst") + except: + print "./feed.lst not found" + exit(0) + + retrieve_feed() \ No newline at end of file -- cgit