author    Cédric Bonhomme <cedric@cedricbonhomme.org>  2020-03-24 15:45:27 +0100
committer Cédric Bonhomme <cedric@cedricbonhomme.org>  2020-03-24 15:45:27 +0100
commit    174e73b64f9660ab8a7c2ca700185c67f995c293 (patch)
tree      04f9d6d22e3b9e74ab756374b579c23fb872659f
parent    Updated CHANGELOG. (diff)
download  newspipe-174e73b64f9660ab8a7c2ca700185c67f995c293.tar.gz
          newspipe-174e73b64f9660ab8a7c2ca700185c67f995c293.tar.bz2
          newspipe-174e73b64f9660ab8a7c2ca700185c67f995c293.zip
Fixes for the crawler.
-rw-r--r--  newspipe/crawler/default_crawler.py  11
-rw-r--r--  newspipe/lib/article_utils.py        10
2 files changed, 18 insertions(+), 3 deletions(-)
diff --git a/newspipe/crawler/default_crawler.py b/newspipe/crawler/default_crawler.py
index 3d6222e9..b1153582 100644
--- a/newspipe/crawler/default_crawler.py
+++ b/newspipe/crawler/default_crawler.py
@@ -169,7 +169,16 @@ async def retrieve_feed(queue, users, feed_id=None):
filters["last_retrieved__lt"] = datetime.now() - timedelta(
minutes=application.config["FEED_REFRESH_INTERVAL"]
)
- feeds = FeedController().read(**filters).all()
+ #feeds = FeedController().read(**filters).all()
+ feeds = [] # temporary fix for: sqlalchemy.exc.OperationalError: (psycopg2.OperationalError) SSL SYSCALL error: EOF detected
+ for feed in user.feeds:
+ if not feed.enabled:
+ continue
+ if feed.error_count > application.config["DEFAULT_MAX_ERROR"]:
+ continue
+ if feed.last_retrieved > (datetime.now() - timedelta(minutes=application.config["FEED_REFRESH_INTERVAL"])):
+ continue
+ feeds.append(feed)
if feeds == []:
logger.info("No feed to retrieve for {}".format(user.nickname))
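
The hunk above swaps the single FeedController query for in-Python filtering over user.feeds, working around the psycopg2 SSL SYSCALL error. A minimal sketch of the same filter factored into a helper; feed_needs_refresh is a hypothetical name, and the None check on last_retrieved (for feeds never fetched) is an assumption, since the original comparison would raise TypeError in that case:

    from datetime import datetime, timedelta

    def feed_needs_refresh(feed, max_error, refresh_interval):
        # Skip disabled feeds and feeds that have failed too often.
        if not feed.enabled or feed.error_count > max_error:
            return False
        # Assumption: a feed never retrieved yet (last_retrieved is None) is due.
        if feed.last_retrieved is None:
            return True
        # Otherwise it is due once the refresh interval has elapsed.
        return feed.last_retrieved <= datetime.now() - timedelta(minutes=refresh_interval)

    feeds = [
        feed
        for feed in user.feeds
        if feed_needs_refresh(
            feed,
            application.config["DEFAULT_MAX_ERROR"],
            application.config["FEED_REFRESH_INTERVAL"],
        )
    ]
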
diff --git a/newspipe/lib/article_utils.py b/newspipe/lib/article_utils.py
index 3f6ee2ba..ec074fa9 100644
--- a/newspipe/lib/article_utils.py
+++ b/newspipe/lib/article_utils.py
@@ -19,7 +19,12 @@ PROCESSED_DATE_KEYS = {"published", "created", "updated"}
def extract_id(entry):
""" extract a value from an entry that will identify it among the other of
that feed"""
- return entry.get("entry_id") or entry.get("id") or entry["link"]
+ entry_id = 'undefined'
+ try:
+ entry_id = entry.get("entry_id") or entry.get("id") or entry["link"]
+ except:
+ pass
+ return entry_id
async def construct_article(entry, feed, fields=None, fetch=True):
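
The new extract_id falls back to 'undefined' instead of letting a missing link raise. A sketch of the same fallback with a narrower exception; catching only KeyError is an assumption that the entry["link"] subscript is the sole call here that can raise, since dict.get never does for missing keys:

    def extract_id(entry):
        """Identify an entry among the others of its feed, falling back
        to 'undefined' when neither an id nor a link is present."""
        try:
            # Only the entry["link"] subscript can raise KeyError here.
            return entry.get("entry_id") or entry.get("id") or entry["link"]
        except KeyError:
            return "undefined"
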
@@ -85,12 +90,13 @@ async def get_article_details(entry, fetch=True):
):
try:
# resolves URL behind proxies (like feedproxy.google.com)
+ print('trying to resolve URL...')
response = await newspipe_get(article_link, timeout=5)
except MissingSchema:
split, failed = urlsplit(article_link), False
for scheme in "https", "http":
- new_link = urlunsplit(SplitResult(scheme, *split[1:]))
try:
+ new_link = urlunsplit(SplitResult(scheme, *split[1:]))
response = await newspipe_get(new_link, timeout=5)
except Exception:
failed = True
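
The last hunk moves the urlunsplit call inside the try, so an error while rebuilding or fetching the link with one scheme no longer aborts the loop before the other scheme is tried. The retry-with-scheme pattern in isolation, assuming the project's newspipe_get coroutine as used in the diff; returning None after both schemes fail is an assumption for illustration:

    from urllib.parse import SplitResult, urlsplit, urlunsplit

    async def resolve_without_scheme(article_link):
        # For links missing a scheme, try https first, then http.
        split = urlsplit(article_link)
        for scheme in ("https", "http"):
            try:
                new_link = urlunsplit(SplitResult(scheme, *split[1:]))
                return await newspipe_get(new_link, timeout=5)
            except Exception:
                continue  # fall through to the next scheme
        return None  # assumption: caller treats None as "could not resolve"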