From 71b185bf1984080077937a814a0d19d70faf2f77 Mon Sep 17 00:00:00 2001
From: François Schmidts
Date: Fri, 8 Jan 2016 16:30:09 +0100
Subject: using user agent in web crawler

---
 src/conf.py            | 3 +--
 src/web/lib/crawler.py | 4 ++--
 2 files changed, 3 insertions(+), 4 deletions(-)

(limited to 'src')

diff --git a/src/conf.py b/src/conf.py
index a3e7e3bb..7db65fd1 100644
--- a/src/conf.py
+++ b/src/conf.py
@@ -33,8 +33,7 @@ DEFAULTS = {"platform_url": "https://JARR.herokuapp.com/",
             "default_max_error": "3",
             "log_path": "jarr.log",
             "log_level": "info",
-            "user_agent": "JARR "
-                          "(https://github.com/JARR-aggregator)",
+            "user_agent": "JARR (https://github.com/JARR-aggregator)",
             "resolve_article_url": "false",
             "http_proxy": "",
             "secret": "",
diff --git a/src/web/lib/crawler.py b/src/web/lib/crawler.py
index 90a268e8..979ccbfc 100644
--- a/src/web/lib/crawler.py
+++ b/src/web/lib/crawler.py
@@ -52,7 +52,7 @@ class AbstractCrawler:
                 auth=self.auth, data=json.dumps(data,
                                                 default=default_handler),
                 headers={'Content-Type': 'application/json',
-                         'User-Agent': 'jarr'})
+                         'User-Agent': conf.USER_AGENT})
 
     def wait(self, max_wait=300, checks=5, wait_for=2):
         checked, second_waited = 0, 0
@@ -217,7 +217,7 @@ class CrawlerScheduler(AbstractCrawler):
 
     def prepare_headers(self, feed):
         """For a known feed, will construct some header dictionnary"""
-        headers = {'User-Agent': 'jarr/crawler'}
+        headers = {'User-Agent': conf.USER_AGENT}
         if feed.get('last_modified'):
             headers['If-Modified-Since'] = feed['last_modified']
         if feed.get('etag') and 'pyagg' not in feed['etag']:
--
cgit
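The patch above points every outgoing HTTP request at the single "user_agent" setting (exposed as conf.USER_AGENT) instead of the hard-coded 'jarr' and 'jarr/crawler' strings. A minimal sketch of the resulting pattern, assuming only that conf exposes USER_AGENT built from DEFAULTS["user_agent"]; the fetch() helper and its extra_headers parameter are illustrative, not part of the patch:

    import requests

    import conf  # assumption: exposes USER_AGENT from DEFAULTS["user_agent"]

    def fetch(url, extra_headers=None):
        # Every request advertises the same configurable identity, so the
        # string can be changed in one place (the "user_agent" setting).
        headers = {'User-Agent': conf.USER_AGENT}
        headers.update(extra_headers or {})
        return requests.get(url, headers=headers)
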
From 8a72e6ae786ffde8e841afbe725a19b3b874f87e Mon Sep 17 00:00:00 2001
From: François Schmidts
Date: Mon, 11 Jan 2016 10:16:41 +0100
Subject: fixing stuffs

* no more warning on constructing feeds
* using the configured user agent for constructing feed
* regrouping the logic behind knowing if the parsing of a feed worked
---
 src/crawler.py            |  4 ++--
 src/web/lib/feed_utils.py | 13 ++++++++++---
 2 files changed, 12 insertions(+), 5 deletions(-)

(limited to 'src')

diff --git a/src/crawler.py b/src/crawler.py
index 22e73754..1a759945 100644
--- a/src/crawler.py
+++ b/src/crawler.py
@@ -37,7 +37,7 @@ import conf
 from bootstrap import db
 from web.models import User
 from web.controllers import FeedController, ArticleController
-from web.lib.feed_utils import construct_feed_from
+from web.lib.feed_utils import construct_feed_from, is_parsing_ok
 from web.lib.article_utils import construct_article, extract_id
 
 logger = logging.getLogger(__name__)
@@ -87,7 +87,7 @@ async def parse_feed(user, feed):
         FeedController().update({'id': feed.id}, up_feed)
         return
 
-    if parsed_feed['bozo'] == 1 and parsed_feed['entries'] == []:
+    if not is_parsing_ok(parsed_feed):
         up_feed['last_error'] = str(parsed_feed['bozo_exception'])
         up_feed['error_count'] = feed.error_count + 1
         FeedController().update({'id': feed.id}, up_feed)
diff --git a/src/web/lib/feed_utils.py b/src/web/lib/feed_utils.py
index f3b18224..14e6b82b 100644
--- a/src/web/lib/feed_utils.py
+++ b/src/web/lib/feed_utils.py
@@ -2,19 +2,26 @@ import urllib
 import logging
 import requests
 import feedparser
+from conf import USER_AGENT
 from bs4 import BeautifulSoup, SoupStrainer
 
 from web.lib.utils import try_keys, try_get_icon_url, rebuild_url
 
 logger = logging.getLogger(__name__)
+logging.captureWarnings(True)
+
+
+def is_parsing_ok(parsed_feed):
+    return parsed_feed['entries'] or not parsed_feed['bozo']
 
 
 def construct_feed_from(url=None, fp_parsed=None, feed=None, query_site=True):
+    requests_kwargs = {'headers': {'User-Agent': USER_AGENT}, 'verify': False}
     if url is None and fp_parsed is not None:
         url = fp_parsed.get('url')
     if url is not None and fp_parsed is None:
         try:
-            response = requests.get(url, verify=False)
+            response = requests.get(url, **requests_kwargs)
             fp_parsed = feedparser.parse(response.content,
                                          request_headers=response.headers)
         except Exception:
@@ -24,7 +31,7 @@ def construct_feed_from(url=None, fp_parsed=None, feed=None, query_site=True):
     feed = feed or {}
     feed_split = urllib.parse.urlsplit(url)
     site_split = None
-    if not fp_parsed['bozo']:
+    if is_parsing_ok(fp_parsed):
         feed['link'] = url
         feed['site_link'] = try_keys(fp_parsed['feed'], 'href', 'link')
         feed['title'] = fp_parsed['feed'].get('title')
@@ -48,7 +55,7 @@ def construct_feed_from(url=None, fp_parsed=None, feed=None, query_site=True):
         return feed
 
     try:
-        response = requests.get(feed['site_link'], verify=False)
+        response = requests.get(feed['site_link'], **requests_kwargs)
     except Exception:
         logger.exception('failed to retreive %r', feed['site_link'])
         return feed
--
cgit
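The is_parsing_ok() helper introduced above treats a parse as usable whenever feedparser extracted entries, even if the bozo flag was set for a malformed but recoverable document, and logging.captureWarnings(True) reroutes warnings such as urllib3's InsecureRequestWarning (triggered by verify=False) into the logging system instead of stderr. A rough usage sketch, assuming src/ is on the import path; the feed URL is made up for illustration:

    import logging

    import feedparser
    import requests

    from conf import USER_AGENT
    from web.lib.feed_utils import is_parsing_ok

    # Send warnings (e.g. InsecureRequestWarning from verify=False) to the
    # logging system, as the patch does at module import time.
    logging.captureWarnings(True)

    url = 'http://example.com/feed.xml'  # hypothetical feed URL
    response = requests.get(url, headers={'User-Agent': USER_AGENT},
                            verify=False)
    parsed = feedparser.parse(response.content)

    if is_parsing_ok(parsed):
        print('%d entries parsed' % len(parsed['entries']))
    else:
        # bozo_exception holds the error that made the document unusable
        print('parse failed:', parsed['bozo_exception'])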