aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Cédric Bonhomme <kimble.mandel@gmail.com> 2013-01-20 00:57:21 +0100
committer Cédric Bonhomme <kimble.mandel@gmail.com> 2013-01-20 00:57:21 +0100
commit 1c0ad161a8c50bf4d3c298e498fab889deb3af87 (patch)
tree a0e7423f6457041521252228ec0ef4442a4c5f08
parent improvements... (diff)
download newspipe-1c0ad161a8c50bf4d3c298e498fab889deb3af87.tar.gz
newspipe-1c0ad161a8c50bf4d3c298e498fab889deb3af87.tar.bz2
newspipe-1c0ad161a8c50bf4d3c298e498fab889deb3af87.zip
Added some tricks to get all posts of a blog.
-rwxr-xr-x source/feedgetter.py 74
1 files changed, 50 insertions, 24 deletions
diff --git a/source/feedgetter.py b/source/feedgetter.py
index 2fe85bb5..b6b5e8b6 100755
--- a/source/feedgetter.py
+++ b/source/feedgetter.py
@@ -53,40 +53,43 @@ class FeedGetter(object):
self.articles = mongodb.Articles(conf.MONGODB_ADDRESS, conf.MONGODB_PORT, \
conf.MONGODB_DBNAME, conf.MONGODB_USER, conf.MONGODB_PASSWORD)
- def retrieve_feed(self):
+ def retrieve_feed(self, feed_url=None, feed_original=None):
"""
Parse the file 'feeds.lst' and launch a thread for each RSS feed.
"""
- with open(conf.FEED_LIST) as f:
- for a_feed in f:
- # test if the URL is well formed
- for url_regexp in utils.url_finders:
- if url_regexp.match(a_feed):
- the_good_url = url_regexp.match(a_feed).group(0).replace("\n", "")
- try:
- # launch a new thread for the RSS feed
- thread = threading.Thread(None, self.process, \
- None, (the_good_url,))
- thread.start()
- list_of_threads.append(thread)
- except:
- pass
- break
-
- # wait for all threads are done
- for th in list_of_threads:
- th.join()
-
- def process(self, the_good_url):
+ if feed_url != None:
+ self.process(feed_url, feed_original)
+ else:
+ with open(conf.FEED_LIST) as f:
+ for a_feed in f:
+ # test if the URL is well formed
+ for url_regexp in utils.url_finders:
+ if url_regexp.match(a_feed):
+ the_good_url = url_regexp.match(a_feed).group(0).replace("\n", "")
+ try:
+ # launch a new thread for the RSS feed
+ thread = threading.Thread(None, self.process, \
+ None, (the_good_url,))
+ thread.start()
+ list_of_threads.append(thread)
+ except:
+ pass
+ break
+
+ # wait until all threads are done
+ for th in list_of_threads:
+ th.join()
+
+ def process(self, the_good_url, feed_original=None):
"""Request the URL
Executed in a thread.
"""
if utils.detect_url_errors([the_good_url]) == []:
# if the resource is available, add the articles to the database.
- self.add_into_database(the_good_url)
+ self.add_into_database(the_good_url, feed_original)
- def add_into_database(self, feed_link):
+ def add_into_database(self, feed_link, feed_original=None):
"""
Add the articles of the feed 'a_feed' in the SQLite base.
"""
@@ -98,6 +101,9 @@ class FeedGetter(object):
except:
feed_image = "/img/feed-icon-28x28.png"
+ if feed_original != None:
+ feed_link = feed_original
+
sha1_hash = hashlib.sha1()
sha1_hash.update(feed_link.encode('utf-8'))
feed_id = sha1_hash.hexdigest()
@@ -161,3 +167,23 @@ if __name__ == "__main__":
# Point of entry in execution mode
feed_getter = FeedGetter()
feed_getter.retrieve_feed()
+
+ # If you want to get all articles of a blog:
+ """
+ for i in range(1,86):
+ feed_original = "http://esr.ibiblio.org/?feed=rss2"
+ feed = feed_original + "&paged=" + str(i)
+ print("Retrieving", feed, "...")
+ feed_getter.retrieve_feed(feed, feed_original)
+ """
+ """
+ for i in range(1,5):
+ feed_original = "http://spaf.wordpress.com/feed/"
+ feed = feed_original + "?paged=" + str(i)
+ print("Retrieving", feed, "...")
+ feed_getter.retrieve_feed(feed, feed_original)
+ """
+
+ # For a blogspot blog:
+ #feed_getter.retrieve_feed("http://www.blogger.com/feeds/4195135246107166251/posts/default", "http://neopythonic.blogspot.com/feeds/posts/default")
+ #feed_getter.retrieve_feed("http://www.blogger.com/feeds/8699431508730375743/posts/default", "http://python-history.blogspot.com/feeds/posts/default")
\ No newline at end of file
bgstack15