aboutsummaryrefslogtreecommitdiff
path: root/feedgetter.py
diff options
context:
space:
mode:
Diffstat (limited to 'feedgetter.py')
-rwxr-xr-x feedgetter.py 167
1 files changed, 0 insertions, 167 deletions
diff --git a/feedgetter.py b/feedgetter.py
deleted file mode 100755
index e3469132..00000000
--- a/feedgetter.py
+++ /dev/null
@@ -1,167 +0,0 @@
-#! /usr/bin/env python
-#-*- coding: utf-8 -*-
-
-# pyAggr3g470r - A Web based news aggregator.
-# Copyright (C) 2010 Cédric Bonhomme - http://cedricbonhomme.org/
-#
-# For more information : http://bitbucket.org/cedricbonhomme/pyaggr3g470r/
-#
-# This program is free software: you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published by
-# the Free Software Foundation, either version 3 of the License, or
-# (at your option) any later version.
-#
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-# GNU General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License
-# along with this program. If not, see <http://www.gnu.org/licenses/>
-
-__author__ = "Cedric Bonhomme"
-__version__ = "$Revision: 1.0 $"
-__date__ = "$Date: 2010/09/02 $"
-__copyright__ = "Copyright (c) Cedric Bonhomme"
-__license__ = "GPLv3"
-
-import os.path
-import traceback
-import threading
-import feedparser
-import hashlib
-from BeautifulSoup import BeautifulSoup
-
-from datetime import datetime
-
-import utils
-import mongodb
-
-# Not referenced anywhere else in this module.
-feeds_list = []
-# Threads started by FeedGetter.retrieve_feed; they are joined there.
-list_of_threads = []
-
-
-class FeedGetter(object):
- """
- Retrieve the feeds listed in ./var/feed.lst and store their articles.
-
- Feeds are parsed with the feedparser module (by Mark Pilgrim) and saved
- through a mongodb.Articles instance. One thread is launched per feed.
- """
- def __init__(self):
- """
- Create the mongodb.Articles object used to store feeds and articles.
- """
- # MongoDB connections
- self.articles = mongodb.Articles()
-
- def retrieve_feed(self):
- """
- Read './var/feed.lst' line by line and launch a thread for each line
- that matches one of the patterns in utils.url_finders.
-
- Each thread runs self.process on the matched URL. Started threads are
- appended to the module-level list_of_threads, and every thread in
- that list is joined before this method is done.
- """
- with open("./var/feed.lst") as f:
- for a_feed in f:
- # test if the URL is well formed
- for url_regexp in utils.url_finders:
- if url_regexp.match(a_feed):
- # keep the matched text only, without any newline character
- the_good_url = url_regexp.match(a_feed).group(0).replace("\n", "")
- try:
- # launch a new thread for the RSS feed
- thread = threading.Thread(None, self.process, \
- None, (the_good_url,))
- thread.start()
- list_of_threads.append(thread)
- # NOTE(review): bare except silently discards any error raised in
- # the try block above.
- except:
- pass
- break
-
- # wait until all threads are done
- for th in list_of_threads:
- th.join()
-
- def process(self, the_good_url):
- """
- Check the URL and, if no error is reported, store the feed's articles.
-
- Calls self.add_into_database only when
- utils.detect_url_errors([the_good_url]) returns an empty list.
- Executed in a thread (see retrieve_feed).
- """
- if utils.detect_url_errors([the_good_url]) == []:
- # if the resource is available, add the articles to the database.
- self.add_into_database(the_good_url)
-
- def add_into_database(self, feed_link):
- """
- Parse the feed at 'feed_link' and store it, with its articles, through
- self.articles (a mongodb.Articles instance, not SQLite).
-
- Returns without storing anything when the parsed feed has no entries.
- The feed and each article are identified by the SHA-1 hex digest of
- their link.
- """
- a_feed = feedparser.parse(feed_link)
- if a_feed['entries'] == []:
- return
- # use the feed's own image when it has one, else a default icon
- try:
- feed_image = a_feed.feed.image.href
- except:
- feed_image = "/img/feed-icon-28x28.png"
-
- # feed identifier: SHA-1 hex digest of the feed link
- sha1_hash = hashlib.sha1()
- sha1_hash.update(feed_link.encode('utf-8'))
- feed_id = sha1_hash.hexdigest()
-
- collection_dic = {"feed_id": feed_id, \
- "type": 0, \
- "feed_image": feed_image, \
- "feed_title": utils.clear_string(a_feed.feed.title.encode('utf-8')), \
- "feed_link": feed_link, \
- "site_link": a_feed.feed.link.encode('utf-8'), \
- "mail": False \
- }
-
- self.articles.add_collection(collection_dic)
-
- # build one dictionary per entry, then store them all in a single call
- articles = []
- for article in a_feed['entries']:
- description = ""
- try:
- # article content
- description = article.content[0].value
- except AttributeError:
- try:
- # article description
- description = article.description
- # neither content nor description is available: keep an empty
- # string ('e' is not used)
- except Exception, e:
- description = ""
- # pass the text through BeautifulSoup and convert the result to str
- description = str(BeautifulSoup(description))
- article_title = str(BeautifulSoup(article.title))
-
- # prefer the 'updated' date and fall back to the 'published' date
- try:
- post_date = datetime(*article.updated_parsed[:6])
- except:
- post_date = datetime(*article.published_parsed[:6])
-
-
- # article identifier: SHA-1 hex digest of the article link
- sha1_hash = hashlib.sha1()
- sha1_hash.update(article.link.encode('utf-8'))
- article_id = sha1_hash.hexdigest()
-
- # NOTE: this rebinds 'article' from the parsed entry to the dictionary
- # that gets stored; article.link below is still read from the entry
- # because the right-hand side is evaluated before the assignment.
- article = {"article_id": article_id, \
- "type":1, \
- "article_date": post_date, \
- "article_link": article.link.encode('utf-8'), \
- "article_title": article_title, \
- "article_content": description, \
- "article_readed": False, \
- "article_like": False \
- }
-
- articles.append(article)
-
- self.articles.add_articles(articles, feed_id)
-
- # send new articles by e-mail if desired.
- #threading.Thread(None, utils.send_mail, None, (utils.mail_from, utils.mail_to, \
- #a_feed.feed.title.encode('utf-8'), \
- #article_title, description) \
- #).start()
-
-
-
-if __name__ == "__main__":
- # Entry point when run as a script: build a FeedGetter and fetch the feeds.
- feed_getter = FeedGetter()
- feed_getter.retrieve_feed()
bgstack15