From 71b185bf1984080077937a814a0d19d70faf2f77 Mon Sep 17 00:00:00 2001
From: François Schmidts
Date: Fri, 8 Jan 2016 16:30:09 +0100
Subject: using user agent in web crawler

---
 src/conf.py            | 3 +--
 src/web/lib/crawler.py | 4 ++--
 2 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/src/conf.py b/src/conf.py
index a3e7e3bb..7db65fd1 100644
--- a/src/conf.py
+++ b/src/conf.py
@@ -33,8 +33,7 @@ DEFAULTS = {"platform_url": "https://JARR.herokuapp.com/",
             "default_max_error": "3",
             "log_path": "jarr.log",
             "log_level": "info",
-            "user_agent": "JARR "
-                          "(https://github.com/JARR-aggregator)",
+            "user_agent": "JARR (https://github.com/JARR-aggregator)",
             "resolve_article_url": "false",
             "http_proxy": "",
             "secret": "",
diff --git a/src/web/lib/crawler.py b/src/web/lib/crawler.py
index 90a268e8..979ccbfc 100644
--- a/src/web/lib/crawler.py
+++ b/src/web/lib/crawler.py
@@ -52,7 +52,7 @@ class AbstractCrawler:
                 auth=self.auth,
                 data=json.dumps(data, default=default_handler),
                 headers={'Content-Type': 'application/json',
-                         'User-Agent': 'jarr'})
+                         'User-Agent': conf.USER_AGENT})
 
     def wait(self, max_wait=300, checks=5, wait_for=2):
         checked, second_waited = 0, 0
@@ -217,7 +217,7 @@ class CrawlerScheduler(AbstractCrawler):
 
     def prepare_headers(self, feed):
         """For a known feed, will construct some header dictionnary"""
-        headers = {'User-Agent': 'jarr/crawler'}
+        headers = {'User-Agent': conf.USER_AGENT}
        if feed.get('last_modified'):
             headers['If-Modified-Since'] = feed['last_modified']
         if feed.get('etag') and 'pyagg' not in feed['etag']:
-- 
cgit
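
The patch replaces the hard-coded 'jarr' and 'jarr/crawler' strings with conf.USER_AGENT, whose default value is the "user_agent" entry of DEFAULTS in src/conf.py. Below is a minimal sketch of how a conf module can promote such defaults to module-level constants like USER_AGENT; the loading loop and the environment-variable override are assumptions for illustration, not taken from JARR's actual conf.py.

# Sketch only: one plausible way a conf module can expose DEFAULTS entries
# such as "user_agent" as upper-case module attributes (conf.USER_AGENT).
# The environment-variable override is an assumed detail, not JARR's code.
import os

DEFAULTS = {
    "user_agent": "JARR (https://github.com/JARR-aggregator)",
    "log_level": "info",
}

for _key, _default in DEFAULTS.items():
    # DEFAULTS["user_agent"] becomes the module attribute USER_AGENT,
    # optionally overridden by an environment variable of the same name.
    globals()[_key.upper()] = os.environ.get(_key.upper(), _default)

With the patch applied, every request the crawler sends (both to the JARR API and to remote feeds) carries that single configurable string, e.g. headers = {'User-Agent': conf.USER_AGENT}, so a deployment can change how it identifies itself without touching the crawler code.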