From 4b8e533cd591c67ad47d4e8be37e27fbd51af09a Mon Sep 17 00:00:00 2001 From: Cédric Bonhomme Date: Sun, 29 Mar 2020 22:48:15 +0200 Subject: Minor fixes to the crawler. --- instance/sqlite.py | 2 +- newspipe/crawler/default_crawler.py | 8 +++++--- newspipe/lib/article_utils.py | 4 ++-- newspipe/lib/feed_utils.py | 2 ++ newspipe/lib/utils.py | 2 +- newspipe/templates/home.html | 9 ++++++--- 6 files changed, 17 insertions(+), 10 deletions(-) diff --git a/instance/sqlite.py b/instance/sqlite.py index 18b906dd..60047080 100644 --- a/instance/sqlite.py +++ b/instance/sqlite.py @@ -42,7 +42,7 @@ CRAWLER_USER_AGENT = "Newspipe (https://git.sr.ht/~cedric/newspipe)" CRAWLER_TIMEOUT = 30 CRAWLER_RESOLV = False RESOLVE_ARTICLE_URL = False -FEED_REFRESH_INTERVAL = 120 +FEED_REFRESH_INTERVAL = 0 # Notification MAIL_SERVER = "localhost" diff --git a/newspipe/crawler/default_crawler.py b/newspipe/crawler/default_crawler.py index b1153582..d76ca4fa 100644 --- a/newspipe/crawler/default_crawler.py +++ b/newspipe/crawler/default_crawler.py @@ -61,7 +61,7 @@ async def parse_feed(user, feed): # with (await sem): try: logger.info("Retrieving feed {}".format(feed.link)) - resp = await newspipe_get(feed.link, timeout=5) + resp = newspipe_get(feed.link, timeout=5) except Exception: logger.info("Problem when reading feed {}".format(feed.link)) return @@ -117,8 +117,8 @@ async def insert_articles(queue, nḅ_producers=1): if item is None: nb_producers_done += 1 if nb_producers_done == nḅ_producers: - print("All producers done.") - print("Process finished.") + logger.info("All producers done.") + logger.info("Process finished.") break continue @@ -179,6 +179,8 @@ async def retrieve_feed(queue, users, feed_id=None): if feed.last_retrieved > (datetime.now() - timedelta(minutes=application.config["FEED_REFRESH_INTERVAL"])): continue feeds.append(feed) + if feed_id and feed_id == feed.id: + break if feeds == []: logger.info("No feed to retrieve for {}".format(user.nickname)) diff --git a/newspipe/lib/article_utils.py b/newspipe/lib/article_utils.py index 00023fd7..0490d4d7 100644 --- a/newspipe/lib/article_utils.py +++ b/newspipe/lib/article_utils.py @@ -90,13 +90,13 @@ async def get_article_details(entry, fetch=True): ): try: # resolves URL behind proxies (like feedproxy.google.com) - response = await newspipe_get(article_link, timeout=5) + response = newspipe_get(article_link, timeout=5) except MissingSchema: split, failed = urlsplit(article_link), False for scheme in "https", "http": try: new_link = urlunsplit(SplitResult(scheme, *split[1:])) - response = await newspipe_get(new_link, timeout=5) + response = newspipe_get(new_link, timeout=5) except Exception: failed = True continue diff --git a/newspipe/lib/feed_utils.py b/newspipe/lib/feed_utils.py index 995bfaae..70ded817 100644 --- a/newspipe/lib/feed_utils.py +++ b/newspipe/lib/feed_utils.py @@ -55,6 +55,7 @@ def construct_feed_from(url=None, fp_parsed=None, feed=None, query_site=True): except Exception: logger.exception("failed to retrieve that url") fp_parsed = {"bozo": True} + assert url is not None and fp_parsed is not None feed = feed or {} feed_split = urllib.parse.urlsplit(url) @@ -113,6 +114,7 @@ def construct_feed_from(url=None, fp_parsed=None, feed=None, query_site=True): return wrapper if not feed.get("icon_url"): + icons = bs_parsed.find_all(check_keys(rel=["icon", "shortcut"])) if not len(icons): icons = bs_parsed.find_all(check_keys(rel=["icon"])) diff --git a/newspipe/lib/utils.py b/newspipe/lib/utils.py index 3d6bf0b8..7e6f3cf4 100644 --- a/newspipe/lib/utils.py +++ b/newspipe/lib/utils.py @@ -90,7 +90,7 @@ def redirect_url(default="home"): return request.args.get("next") or request.referrer or url_for(default) -async def newspipe_get(url, **kwargs): +def newspipe_get(url, **kwargs): request_kwargs = { "verify": False, "allow_redirects": True, diff --git a/newspipe/templates/home.html b/newspipe/templates/home.html index c7340f63..74180bc9 100644 --- a/newspipe/templates/home.html +++ b/newspipe/templates/home.html @@ -33,7 +33,7 @@ 2 else "orange" }} ;" class="badge pull-right" title="Some errors occured while trying to retrieve that feed.">{{ in_error[fid] }} {% endif %} {{ nbunread }} - + {{ feeds[fid].title | safe | truncate(25, True) }} {% if feed_id == fid %}{% endif %} @@ -52,7 +52,7 @@ 2 else "orange" }} ;" class="badge pull-right" title="Some errors occured while trying to retrieve that feed.">{{ in_error[fid] }} {% endif %} {% if feed_id == fid %}{% endif %} - + {{ feed.title | safe | truncate(25, True) }} {% if feed_id == fid %}{% endif %} @@ -126,7 +126,10 @@ {% endif %} {% if not feed_id %} - {{ article.source.title | safe }} + + + {{ article.source.title | safe }} + {% endif %} {{ article.title | safe }} -- cgit