aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Cédric Bonhomme <cedric@cedricbonhomme.org> 2016-01-11 15:22:26 +0100
committer Cédric Bonhomme <cedric@cedricbonhomme.org> 2016-01-11 15:22:26 +0100
commit 6d0e7c8fe0da4ce96fb9c7df3ab9c970f3a2913d (patch)
tree 36d6ff44e909aa00bcb52adbe1f2fe5ede1f53f3
parent Moved the crawler in the parent folder. (diff)
parent fixing stuffs (diff)
download newspipe-6d0e7c8fe0da4ce96fb9c7df3ab9c970f3a2913d.tar.gz
newspipe-6d0e7c8fe0da4ce96fb9c7df3ab9c970f3a2913d.tar.bz2
newspipe-6d0e7c8fe0da4ce96fb9c7df3ab9c970f3a2913d.zip
Merge pull request #30 from jaesivsm/master
misc fixes on crawlers and use of User-Agent
-rw-r--r-- src/conf.py 3
-rw-r--r-- src/crawler.py 4
-rw-r--r-- src/web/lib/crawler.py 4
-rw-r--r-- src/web/lib/feed_utils.py 13
4 files changed, 15 insertions, 9 deletions
diff --git a/src/conf.py b/src/conf.py
index a3e7e3bb..7db65fd1 100644
--- a/src/conf.py
+++ b/src/conf.py
@@ -33,8 +33,7 @@ DEFAULTS = {"platform_url": "https://JARR.herokuapp.com/",
"default_max_error": "3",
"log_path": "jarr.log",
"log_level": "info",
- "user_agent": "JARR "
- "(https://github.com/JARR-aggregator)",
+ "user_agent": "JARR (https://github.com/JARR-aggregator)",
"resolve_article_url": "false",
"http_proxy": "",
"secret": "",
diff --git a/src/crawler.py b/src/crawler.py
index 22e73754..1a759945 100644
--- a/src/crawler.py
+++ b/src/crawler.py
@@ -37,7 +37,7 @@ import conf
from bootstrap import db
from web.models import User
from web.controllers import FeedController, ArticleController
-from web.lib.feed_utils import construct_feed_from
+from web.lib.feed_utils import construct_feed_from, is_parsing_ok
from web.lib.article_utils import construct_article, extract_id
logger = logging.getLogger(__name__)
@@ -87,7 +87,7 @@ async def parse_feed(user, feed):
FeedController().update({'id': feed.id}, up_feed)
return
- if parsed_feed['bozo'] == 1 and parsed_feed['entries'] == []:
+ if not is_parsing_ok(parsed_feed):
up_feed['last_error'] = str(parsed_feed['bozo_exception'])
up_feed['error_count'] = feed.error_count + 1
FeedController().update({'id': feed.id}, up_feed)
diff --git a/src/web/lib/crawler.py b/src/web/lib/crawler.py
index 90a268e8..979ccbfc 100644
--- a/src/web/lib/crawler.py
+++ b/src/web/lib/crawler.py
@@ -52,7 +52,7 @@ class AbstractCrawler:
auth=self.auth, data=json.dumps(data,
default=default_handler),
headers={'Content-Type': 'application/json',
- 'User-Agent': 'jarr'})
+ 'User-Agent': conf.USER_AGENT})
def wait(self, max_wait=300, checks=5, wait_for=2):
checked, second_waited = 0, 0
@@ -217,7 +217,7 @@ class CrawlerScheduler(AbstractCrawler):
def prepare_headers(self, feed):
"""For a known feed, will construct some header dictionnary"""
- headers = {'User-Agent': 'jarr/crawler'}
+ headers = {'User-Agent': conf.USER_AGENT}
if feed.get('last_modified'):
headers['If-Modified-Since'] = feed['last_modified']
if feed.get('etag') and 'pyagg' not in feed['etag']:
diff --git a/src/web/lib/feed_utils.py b/src/web/lib/feed_utils.py
index f3b18224..14e6b82b 100644
--- a/src/web/lib/feed_utils.py
+++ b/src/web/lib/feed_utils.py
@@ -2,19 +2,26 @@ import urllib
import logging
import requests
import feedparser
+from conf import USER_AGENT
from bs4 import BeautifulSoup, SoupStrainer
from web.lib.utils import try_keys, try_get_icon_url, rebuild_url
logger = logging.getLogger(__name__)
+logging.captureWarnings(True)
+
+
+def is_parsing_ok(parsed_feed):
+ return parsed_feed['entries'] or not parsed_feed['bozo']
def construct_feed_from(url=None, fp_parsed=None, feed=None, query_site=True):
+ requests_kwargs = {'headers': {'User-Agent': USER_AGENT}, 'verify': False}
if url is None and fp_parsed is not None:
url = fp_parsed.get('url')
if url is not None and fp_parsed is None:
try:
- response = requests.get(url, verify=False)
+ response = requests.get(url, **requests_kwargs)
fp_parsed = feedparser.parse(response.content,
request_headers=response.headers)
except Exception:
@@ -24,7 +31,7 @@ def construct_feed_from(url=None, fp_parsed=None, feed=None, query_site=True):
feed = feed or {}
feed_split = urllib.parse.urlsplit(url)
site_split = None
- if not fp_parsed['bozo']:
+ if is_parsing_ok(fp_parsed):
feed['link'] = url
feed['site_link'] = try_keys(fp_parsed['feed'], 'href', 'link')
feed['title'] = fp_parsed['feed'].get('title')
@@ -48,7 +55,7 @@ def construct_feed_from(url=None, fp_parsed=None, feed=None, query_site=True):
return feed
try:
- response = requests.get(feed['site_link'], verify=False)
+ response = requests.get(feed['site_link'], **requests_kwargs)
except Exception:
logger.exception('failed to retreive %r', feed['site_link'])
return feed
bgstack15