path: root/src/crawler/default_crawler.py
author: Cédric Bonhomme <cedric@cedricbonhomme.org> 2019-05-23 08:39:02 +0200
committer: Cédric Bonhomme <cedric@cedricbonhomme.org> 2019-05-23 08:39:02 +0200
commit: 2dfb4c44eb983c057c1146943e503f8f6979777b (patch)
tree: d2bfa3018ed31b1b2de251dea98db5c30fa8f643 /src/crawler/default_crawler.py
parent: Removed list of created articles. (diff)
download: newspipe-2dfb4c44eb983c057c1146943e503f8f6979777b.tar.gz
          newspipe-2dfb4c44eb983c057c1146943e503f8f6979777b.tar.bz2
          newspipe-2dfb4c44eb983c057c1146943e503f8f6979777b.zip
Improved default crawler.
Diffstat (limited to 'src/crawler/default_crawler.py')
-rw-r--r--  src/crawler/default_crawler.py  60
1 file changed, 25 insertions(+), 35 deletions(-)
diff --git a/src/crawler/default_crawler.py b/src/crawler/default_crawler.py
index 543724bc..148bddaa 100644
--- a/src/crawler/default_crawler.py
+++ b/src/crawler/default_crawler.py
@@ -29,7 +29,6 @@ __license__ = "AGPLv3"
import io
import asyncio
import logging
-import requests
import feedparser
import dateutil.parser
from datetime import datetime, timezone, timedelta
@@ -39,6 +38,7 @@ import conf
from bootstrap import db
from web.models import User
from web.controllers import FeedController, ArticleController
+from lib.utils import jarr_get
from lib.feed_utils import construct_feed_from, is_parsing_ok
from lib.article_utils import construct_article, extract_id, \
get_article_content
@@ -48,24 +48,7 @@ logger = logging.getLogger(__name__)
sem = asyncio.Semaphore(5)
-def get(*args, **kwargs):
- #kwargs["connector"] = aiohttp.TCPConnector(verify_ssl=False)
- try:
- logger.info('Retrieving feed {}'.format(args[0]))
- resp = requests.get(args[0], timeout=20.0)
- except requests.ReadTimeout:
- logger.info('Timeout when reading feed {}'.format(args[0]))
- raise e
- content = io.BytesIO(resp.content)
- try:
- data = feedparser.parse(content)
- except Exception as e:
- raise e
- return data
-
-
-
-def parse_feed(user, feed):
+async def parse_feed(user, feed):
"""
Fetch a feed.
Update the feed and return the articles.
@@ -73,21 +56,30 @@ def parse_feed(user, feed):
parsed_feed = None
up_feed = {}
articles = []
+ resp = None
#with (await sem):
try:
- parsed_feed = get(feed.link)
+ logger.info('Retrieving feed {}'.format(feed.link))
+ resp = await jarr_get(feed.link, timeout=5)
except Exception as e:
- up_feed['last_error'] = str(e)
- up_feed['error_count'] = feed.error_count + 1
- logger.exception("error when parsing feed: " + str(e))
+ logger.info('Problem when reading feed {}'.format(feed.link))
+ raise e
finally:
- up_feed['last_retrieved'] = datetime.now(dateutil.tz.tzlocal())
- if parsed_feed is None:
- try:
- FeedController().update({'id': feed.id}, up_feed)
- except Exception as e:
- logger.exception('something bad here: ' + str(e))
- return
+ try:
+ content = io.BytesIO(resp.content)
+ parsed_feed = feedparser.parse(content)
+ except Exception as e:
+ up_feed['last_error'] = str(e)
+ up_feed['error_count'] = feed.error_count + 1
+ logger.exception("error when parsing feed: " + str(e))
+ finally:
+ up_feed['last_retrieved'] = datetime.now(dateutil.tz.tzlocal())
+ if parsed_feed is None:
+ try:
+ FeedController().update({'id': feed.id}, up_feed)
+ except Exception as e:
+ logger.exception('something bad here: ' + str(e))
+ return
if not is_parsing_ok(parsed_feed):
up_feed['last_error'] = str(parsed_feed['bozo_exception'])
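
Note on the new fetch call: the implementation of jarr_get (imported above from lib.utils) is not part of this diff. A minimal sketch of a helper with the same call shape — assuming it only moves a blocking requests.get onto the event loop's default executor, so that resp.content is the raw bytes parse_feed hands to feedparser — could look like the following; this is an illustration, not the project's actual code.

import asyncio
import functools

import requests


async def jarr_get(url, timeout=20.0):
    # Hypothetical stand-in for lib.utils.jarr_get: run the blocking
    # requests.get in a worker thread so the coroutine does not block
    # the event loop, and return the requests Response unchanged
    # (its .content attribute holds the body as bytes).
    loop = asyncio.get_event_loop()
    return await loop.run_in_executor(
        None, functools.partial(requests.get, url, timeout=timeout)
    )
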
@@ -134,7 +126,7 @@ async def insert_articles(queue, nḅ_producers=1):
logger.info('None')
articles = []
- logger.info('Inserting articles for {}'.format(feed.title))
+ logger.info('Inserting articles for {}'.format(feed.link))
art_contr = ArticleController(user.id)
for article in articles:
@@ -155,7 +147,7 @@ async def insert_articles(queue, nḅ_producers=1):
art_contr.create(**new_article)
logger.info('New article added: {}'.format(new_article['link']))
except Exception:
- logger.exception('Error when inserting article in database:')
+ logger.exception('Error when inserting article in database.')
continue
@@ -175,13 +167,11 @@ async def retrieve_feed(queue, users, feed_id=None):
timedelta(minutes=conf.FEED_REFRESH_INTERVAL)
feeds = FeedController().read(**filters).all()
-
if feeds == []:
logger.info('No feed to retrieve for {}'.format(user.nickname))
-
for feed in feeds:
- articles = parse_feed(user, feed)
+ articles = await parse_feed(user, feed)
await queue.put((user, feed, articles))
await queue.put(None)
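
The last two hunks show both ends of the crawler's queue: retrieve_feed is the producer that puts (user, feed, articles) tuples on an asyncio.Queue and finishes with a None sentinel, and insert_articles(queue, nḅ_producers=1) is the consumer that counts those sentinels to know when every producer is done. A self-contained sketch of that pattern, with illustrative names rather than newspipe's own:

import asyncio


async def producer(queue, items):
    # Stand-in for retrieve_feed: hand each unit of work to the consumer,
    # then signal completion with a None sentinel.
    for item in items:
        await queue.put(item)
    await queue.put(None)


async def consumer(queue, nb_producers=1):
    # Stand-in for insert_articles: keep consuming until one sentinel
    # per producer has been seen.
    finished = 0
    while finished < nb_producers:
        item = await queue.get()
        if item is None:
            finished += 1
            continue
        print('processing', item)  # where ArticleController.create() would run


async def main():
    queue = asyncio.Queue(maxsize=3)
    await asyncio.gather(
        producer(queue, ['feed-1', 'feed-2']),
        producer(queue, ['feed-3']),
        consumer(queue, nb_producers=2),
    )


asyncio.run(main())

One sentinel per producer lets the consumer drain the queue and exit cleanly without task cancellation; the trade-off is that the consumer must be told how many producers exist.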