From 2d72f44a90a76fe7450e59fdfdf4d42f44b9cd96 Mon Sep 17 00:00:00 2001 From: Cédric Bonhomme Date: Tue, 8 Nov 2016 14:39:47 +0100 Subject: various improvements to the crawler (better use of coroutines, test if an article should be updated). tags are now retrieved for the k-means clustering (previously achieved with the content of articles) --- src/web/lib/feed_utils.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'src/web/lib/feed_utils.py') diff --git a/src/web/lib/feed_utils.py b/src/web/lib/feed_utils.py index 9925613f..94ae6e53 100644 --- a/src/web/lib/feed_utils.py +++ b/src/web/lib/feed_utils.py @@ -3,7 +3,7 @@ import urllib import logging import requests import feedparser -from conf import USER_AGENT +from conf import CRAWLER_USER_AGENT from bs4 import BeautifulSoup, SoupStrainer from web.lib.utils import try_keys, try_get_icon_url, rebuild_url @@ -32,7 +32,8 @@ def escape_keys(*keys): @escape_keys('title', 'description') def construct_feed_from(url=None, fp_parsed=None, feed=None, query_site=True): - requests_kwargs = {'headers': {'User-Agent': USER_AGENT}, 'verify': False} + requests_kwargs = {'headers': {'User-Agent': CRAWLER_USER_AGENT}, + 'verify': False} if url is None and fp_parsed is not None: url = fp_parsed.get('url') if url is not None and fp_parsed is None: -- cgit