authorCédric Bonhomme <cedric@cedricbonhomme.org>2019-05-22 07:53:12 +0200
committerCédric Bonhomme <cedric@cedricbonhomme.org>2019-05-22 07:53:12 +0200
commit7540a5ce98f3805952c9124fed63f2cd55049071 (patch)
tree0d1c68ab42301beceb3067885fcc42eff051c60d /src/crawler
parentUpdated dependencies. (diff)
downloadnewspipe-7540a5ce98f3805952c9124fed63f2cd55049071.tar.gz
newspipe-7540a5ce98f3805952c9124fed63f2cd55049071.tar.bz2
newspipe-7540a5ce98f3805952c9124fed63f2cd55049071.zip
Feeds are now retrieved with requests; feedparser is only used for parsing.
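In practice the new get() flow boils down to fetching the body with requests and handing the raw bytes to feedparser. A minimal sketch of that pattern, assuming a placeholder feed URL (the 20-second timeout matches the diff below):

import io

import feedparser
import requests

feed_url = 'https://example.org/feed.xml'  # placeholder URL for illustration

# Fetch the raw feed body ourselves; on a read timeout requests raises
# requests.ReadTimeout, which the crawler logs and re-raises.
resp = requests.get(feed_url, timeout=20.0)

# feedparser is only used for the parsing: give it a file-like object
# wrapping the downloaded bytes instead of the URL.
data = feedparser.parse(io.BytesIO(resp.content))

for entry in data.entries:
    print(entry.get('title'))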
Diffstat (limited to 'src/crawler')
-rw-r--r--    src/crawler/default_crawler.py    24
1 file changed, 16 insertions, 8 deletions
diff --git a/src/crawler/default_crawler.py b/src/crawler/default_crawler.py
index e511eb9d..3a0f72b1 100644
--- a/src/crawler/default_crawler.py
+++ b/src/crawler/default_crawler.py
@@ -26,8 +26,10 @@ __revision__ = "$Date: 2010/05/21 $"
 __copyright__ = "Copyright (c) Cedric Bonhomme"
 __license__ = "AGPLv3"
 
+import io
 import asyncio
 import logging
+import requests
 import feedparser
 import dateutil.parser
 from datetime import datetime, timezone, timedelta
@@ -50,10 +52,17 @@ def get(*args, **kwargs):
     #kwargs["connector"] = aiohttp.TCPConnector(verify_ssl=False)
     try:
         logger.info('Retrieving feed {}'.format(args[0]))
-        data = feedparser.parse(args[0])
-        return data
+        resp = requests.get(args[0], timeout=20.0)
+    except requests.ReadTimeout:
+        logger.info('Timeout when reading feed {}'.format(args[0]))
+        raise
+    content = io.BytesIO(resp.content)
+    try:
+        data = feedparser.parse(content)
     except Exception as e:
         raise e
+    return data
+
 
 
 def parse_feed(user, feed):
@@ -119,8 +128,8 @@ async def insert_articles(queue, nb_producers=1):
             continue
         user, feed, articles = item
-
-
+
+
         if None is articles:
             logger.info('None')
             articles = []
@@ -166,15 +175,14 @@ async def retrieve_feed(queue, users, feed_id=None):
         filters['last_retrieved__lt'] = datetime.now() - \
             timedelta(minutes=conf.FEED_REFRESH_INTERVAL)
         feeds = FeedController().read(**filters).all()
-
+
         if feeds == []:
             logger.info('No feed to retrieve for {}'.format(user.nickname))
-
-
+
+
         for feed in feeds:
             articles = parse_feed(user, feed)
             await queue.put((user, feed, articles))
     await queue.put(None)
-
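The hunks above are one end of a producer/consumer pipeline: retrieve_feed() puts (user, feed, articles) tuples on an asyncio queue and a None sentinel when it is done, which insert_articles() treats as a per-producer stop signal. A minimal self-contained sketch of that sentinel pattern, with hypothetical producer/consumer names and string payloads standing in for the real tuples:

import asyncio

async def producer(queue, items):
    # Stand-in for retrieve_feed(): push work items, then a sentinel.
    for item in items:
        await queue.put(item)
    await queue.put(None)  # None signals "this producer is finished"

async def consumer(queue, nb_producers=1):
    # Stand-in for insert_articles(): drain until every producer is done.
    finished = 0
    while finished < nb_producers:
        item = await queue.get()
        if item is None:
            finished += 1
            continue
        print('processing', item)

async def main():
    queue = asyncio.Queue()
    await asyncio.gather(producer(queue, ['feed-1', 'feed-2']),
                         consumer(queue, nb_producers=1))

asyncio.run(main())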