From ca476c4303445b1c92ef983b06eb32e2dc87a19d Mon Sep 17 00:00:00 2001 From: Cédric Bonhomme Date: Mon, 30 Mar 2020 21:19:32 +0200 Subject: removed debug log --- newspipe/crawler/default_crawler.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/newspipe/crawler/default_crawler.py b/newspipe/crawler/default_crawler.py index 0584fb90..42b1450f 100644 --- a/newspipe/crawler/default_crawler.py +++ b/newspipe/crawler/default_crawler.py @@ -180,7 +180,7 @@ async def retrieve_feed(queue, users, feed_id=None): continue if None is feed_id or (feed_id and feed_id == feed.id): feeds.append(feed) - logger.info(feeds) + if feeds == []: logger.info("No feed to retrieve for {}".format(user.nickname)) -- cgit From 1c5189ab844f71cc65dcf5f1f214e0793976ff50 Mon Sep 17 00:00:00 2001 From: Cédric Bonhomme Date: Mon, 30 Mar 2020 21:50:27 +0200 Subject: catch exception when trying to fetch feed icon with requests --- newspipe/controllers/icon.py | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/newspipe/controllers/icon.py b/newspipe/controllers/icon.py index d5dd7fe6..b0fad5ac 100644 --- a/newspipe/controllers/icon.py +++ b/newspipe/controllers/icon.py @@ -2,6 +2,7 @@ import base64 import requests +from newspipe.lib.utils import newspipe_get from newspipe.models import Icon from .abstract import AbstractController @@ -13,14 +14,17 @@ class IconController(AbstractController): def _build_from_url(self, attrs): if "url" in attrs and "content" not in attrs: - resp = requests.get(attrs["url"], verify=False) - attrs.update( - { - "url": resp.url, - "mimetype": resp.headers.get("content-type", None), - "content": base64.b64encode(resp.content).decode("utf8"), - } - ) + try: + resp = newspipe_get(attrs["url"], timeout=5) + attrs.update( + { + "url": resp.url, + "mimetype": resp.headers.get("content-type", None), + "content": base64.b64encode(resp.content).decode("utf8"), + } + ) + except requests.exceptions.ConnectionError: + pass return 
attrs def create(self, **attrs): -- cgit From e68413912ba74e8d5f38fab45df20374a3214800 Mon Sep 17 00:00:00 2001 From: Cédric Bonhomme Date: Mon, 30 Mar 2020 22:01:51 +0200 Subject: catch ParseError from dateutil --- newspipe/lib/article_utils.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/newspipe/lib/article_utils.py b/newspipe/lib/article_utils.py index 0490d4d7..7e1b2a36 100644 --- a/newspipe/lib/article_utils.py +++ b/newspipe/lib/article_utils.py @@ -2,6 +2,7 @@ import html import logging import re from datetime import datetime, timezone +from dateutil.parser._parser import ParserError from enum import Enum from urllib.parse import SplitResult, urlsplit, urlunsplit @@ -47,6 +48,8 @@ async def construct_article(entry, feed, fields=None, fetch=True): article["date"] = dateutil.parser.parse(entry[date_key]).astimezone( timezone.utc ) + except ParserError: + logger.exception("Error when parsing date {}".format(entry[date_key])) except Exception as e: logger.exception(e) else: -- cgit From 80834199a4177db0f19263bfb76b0e87b102caf4 Mon Sep 17 00:00:00 2001 From: Cédric Bonhomme Date: Mon, 30 Mar 2020 22:45:33 +0200 Subject: wip --- newspipe/crawler/default_crawler.py | 5 ++++- newspipe/lib/article_utils.py | 4 ++-- newspipe/lib/feed_utils.py | 5 ++++- 3 files changed, 10 insertions(+), 4 deletions(-) diff --git a/newspipe/crawler/default_crawler.py b/newspipe/crawler/default_crawler.py index 42b1450f..a76eca9c 100644 --- a/newspipe/crawler/default_crawler.py +++ b/newspipe/crawler/default_crawler.py @@ -103,7 +103,10 @@ async def parse_feed(user, feed): if feed.title and "title" in up_feed: # do not override the title set by the user del up_feed["title"] - FeedController().update({"id": feed.id}, up_feed) + try: + FeedController().update({"id": feed.id}, up_feed) + except: + logger.exception("error when updating feed: {}".format(feed.link)) return articles diff --git a/newspipe/lib/article_utils.py b/newspipe/lib/article_utils.py index 7e1b2a36..d343f0a1 
100644 --- a/newspipe/lib/article_utils.py +++ b/newspipe/lib/article_utils.py @@ -49,9 +49,9 @@ async def construct_article(entry, feed, fields=None, fetch=True): timezone.utc ) except ParserError: - logger.exception("Error when parsing date {}".format(entry[date_key])) + logger.exception("Error when parsing date: {}".format(entry[date_key])) except Exception as e: - logger.exception(e) + pass else: break push_in_article("content", get_article_content(entry)) diff --git a/newspipe/lib/feed_utils.py b/newspipe/lib/feed_utils.py index 70ded817..0de78580 100644 --- a/newspipe/lib/feed_utils.py +++ b/newspipe/lib/feed_utils.py @@ -42,6 +42,7 @@ def escape_keys(*keys): def construct_feed_from(url=None, fp_parsed=None, feed=None, query_site=True): requests_kwargs = { "headers": {"User-Agent": application.config["CRAWLER_USER_AGENT"]}, + "timeout": application.config["CRAWLER_TIMEOUT"], "verify": False, } if url is None and fp_parsed is not None: @@ -87,7 +88,9 @@ def construct_feed_from(url=None, fp_parsed=None, feed=None, query_site=True): try: response = requests.get(feed["site_link"], **requests_kwargs) - except requests.exceptions.InvalidSchema as e: + except requests.exceptions.InvalidSchema: + return feed + except requests.exceptions.ConnectionError: return feed except: logger.exception("failed to retrieve %r", feed["site_link"]) -- cgit