diff options
author | Cédric Bonhomme <cedric@cedricbonhomme.org> | 2020-03-24 15:45:27 +0100 |
---|---|---|
committer | Cédric Bonhomme <cedric@cedricbonhomme.org> | 2020-03-24 15:45:27 +0100 |
commit | 174e73b64f9660ab8a7c2ca700185c67f995c293 (patch) | |
tree | 04f9d6d22e3b9e74ab756374b579c23fb872659f | |
parent | Updated CHANGELOG. (diff) | |
download | newspipe-174e73b64f9660ab8a7c2ca700185c67f995c293.tar.gz newspipe-174e73b64f9660ab8a7c2ca700185c67f995c293.tar.bz2 newspipe-174e73b64f9660ab8a7c2ca700185c67f995c293.zip |
Fixes for the crawler.
-rw-r--r-- | newspipe/crawler/default_crawler.py | 11 | ||||
-rw-r--r-- | newspipe/lib/article_utils.py | 10 |
2 files changed, 18 insertions, 3 deletions
diff --git a/newspipe/crawler/default_crawler.py b/newspipe/crawler/default_crawler.py index 3d6222e9..b1153582 100644 --- a/newspipe/crawler/default_crawler.py +++ b/newspipe/crawler/default_crawler.py @@ -169,7 +169,16 @@ async def retrieve_feed(queue, users, feed_id=None): filters["last_retrieved__lt"] = datetime.now() - timedelta( minutes=application.config["FEED_REFRESH_INTERVAL"] ) - feeds = FeedController().read(**filters).all() + #feeds = FeedController().read(**filters).all() + feeds = [] # temporary fix for: sqlalchemy.exc.OperationalError: (psycopg2.OperationalError) SSL SYSCALL error: EOF detected + for feed in user.feeds: + if not feed.enabled: + continue + if feed.error_count > application.config["DEFAULT_MAX_ERROR"]: + continue + if feed.last_retrieved > (datetime.now() - timedelta(minutes=application.config["FEED_REFRESH_INTERVAL"])): + continue + feeds.append(feed) if feeds == []: logger.info("No feed to retrieve for {}".format(user.nickname)) diff --git a/newspipe/lib/article_utils.py b/newspipe/lib/article_utils.py index 3f6ee2ba..ec074fa9 100644 --- a/newspipe/lib/article_utils.py +++ b/newspipe/lib/article_utils.py @@ -19,7 +19,12 @@ PROCESSED_DATE_KEYS = {"published", "created", "updated"} def extract_id(entry): """ extract a value from an entry that will identify it among the other of that feed""" - return entry.get("entry_id") or entry.get("id") or entry["link"] + entry_id = 'undefined' + try: + entry_id = entry.get("entry_id") or entry.get("id") or entry["link"] + except: + pass + return entry_id async def construct_article(entry, feed, fields=None, fetch=True): @@ -85,12 +90,13 @@ async def get_article_details(entry, fetch=True): ): try: # resolves URL behind proxies (like feedproxy.google.com) + print('trying to resolve URL...') response = await newspipe_get(article_link, timeout=5) except MissingSchema: split, failed = urlsplit(article_link), False for scheme in "https", "http": - new_link = urlunsplit(SplitResult(scheme, *split[1:])) 
try: + new_link = urlunsplit(SplitResult(scheme, *split[1:])) response = await newspipe_get(new_link, timeout=5) except Exception: failed = True |