aboutsummaryrefslogtreecommitdiff
path: root/src/crawler
diff options
context:
space:
mode:
authorCédric Bonhomme <cedric@cedricbonhomme.org>2019-05-21 23:49:03 +0200
committerCédric Bonhomme <cedric@cedricbonhomme.org>2019-05-21 23:49:03 +0200
commit7d9bc6621eea035034cd56d20de9333f09387940 (patch)
treed9b985163e990012987b64c7d563c5110a9ff0ca /src/crawler
parentMerge branch 'master' of gitlab.com:newspipe/newspipe (diff)
downloadnewspipe-7d9bc6621eea035034cd56d20de9333f09387940.tar.gz
newspipe-7d9bc6621eea035034cd56d20de9333f09387940.tar.bz2
newspipe-7d9bc6621eea035034cd56d20de9333f09387940.zip
Improved crawler.
Diffstat (limited to 'src/crawler')
-rw-r--r--src/crawler/default_crawler.py224
1 files changed, 108 insertions, 116 deletions
diff --git a/src/crawler/default_crawler.py b/src/crawler/default_crawler.py
index 38810fc0..bba8431a 100644
--- a/src/crawler/default_crawler.py
+++ b/src/crawler/default_crawler.py
@@ -2,7 +2,7 @@
# -*- coding: utf-8 -
# newspipe - A Web based news aggregator.
-# Copyright (C) 2010-2018 Cédric Bonhomme - https://www.cedricbonhomme.org
+# Copyright (C) 2010-2019 Cédric Bonhomme - https://www.cedricbonhomme.org
#
# For more information : https://gitlab.com/newspipe/newspipe
#
@@ -20,9 +20,9 @@
# along with this program. If not, see <http://www.gnu.org/licenses/>.
__author__ = "Cedric Bonhomme"
-__version__ = "$Revision: 3.3 $"
+__version__ = "$Revision: 4.0 $"
__date__ = "$Date: 2010/09/02 $"
-__revision__ = "$Date: 2015/12/07 $"
+__revision__ = "$Date: 2010/05/21 $"
__copyright__ = "Copyright (c) Cedric Bonhomme"
__license__ = "AGPLv3"
@@ -45,18 +45,8 @@ logger = logging.getLogger(__name__)
sem = asyncio.Semaphore(5)
-import ssl
-try:
- _create_unverified_https_context = ssl._create_unverified_context
-except AttributeError:
- # Legacy Python that doesn't verify HTTPS certificates by default
- pass
-else:
- # Handle target environment that doesn't support HTTPS verification
- ssl._create_default_https_context = _create_unverified_https_context
-
-async def get(*args, **kwargs):
+def get(*args, **kwargs):
#kwargs["connector"] = aiohttp.TCPConnector(verify_ssl=False)
try:
logger.info('Retrieving feed {}'.format(args[0]))
@@ -66,7 +56,7 @@ async def get(*args, **kwargs):
raise e
-async def parse_feed(user, feed):
+def parse_feed(user, feed):
"""
Fetch a feed.
Update the feed and return the articles.
@@ -74,21 +64,21 @@ async def parse_feed(user, feed):
parsed_feed = None
up_feed = {}
articles = []
- with (await sem):
- try:
- parsed_feed = await get(feed.link)
- except Exception as e:
- up_feed['last_error'] = str(e)
- up_feed['error_count'] = feed.error_count + 1
- logger.exception("error when parsing feed: " + str(e))
- finally:
- up_feed['last_retrieved'] = datetime.now(dateutil.tz.tzlocal())
- if parsed_feed is None:
- try:
- FeedController().update({'id': feed.id}, up_feed)
- except Exception as e:
- logger.exception('something bad here: ' + str(e))
- return
+ #with (await sem):
+ try:
+ parsed_feed = get(feed.link)
+ except Exception as e:
+ up_feed['last_error'] = str(e)
+ up_feed['error_count'] = feed.error_count + 1
+ logger.exception("error when parsing feed: " + str(e))
+ finally:
+ up_feed['last_retrieved'] = datetime.now(dateutil.tz.tzlocal())
+ if parsed_feed is None:
+ try:
+ FeedController().update({'id': feed.id}, up_feed)
+ except Exception as e:
+ logger.exception('something bad here: ' + str(e))
+ return
if not is_parsing_ok(parsed_feed):
up_feed['last_error'] = str(parsed_feed['bozo_exception'])
@@ -114,95 +104,97 @@ async def parse_feed(user, feed):
return articles
-async def insert_database(user, feed):
- articles = await parse_feed(user, feed)
- if None is articles:
- return []
-
- logger.info('Inserting articles for {}'.format(feed.title))
-
- new_articles = []
- art_contr = ArticleController(user.id)
- for article in articles:
- new_article = await construct_article(article, feed)
-
- try:
- existing_article_req = art_contr.read(feed_id=feed.id,
- entry_id=extract_id(article))
- except Exception as e:
- logger.exception("existing_article_req: " + str(e))
- continue
- exist = existing_article_req.count() != 0
- if exist:
- continue
- # if the article has been already retrieved, we only update
- # the content or the title
- logger.info('Article already in the database: {}'. \
- format(article['link']))
- existing_article = existing_article_req.first()
-
- if new_article['date'].replace(tzinfo=None) != \
- existing_article.date:
- existing_article.date = new_article['date']
- existing_article.updated_date = new_article['date']
- if existing_article.title != new_article['title']:
- existing_article.title = new_article['title']
- content = get_article_content(article)
- if existing_article.content != content:
- existing_article.content = content
- existing_article.readed = False
- art_contr.update({'entry_id': existing_article.entry_id},
- existing_article.dump())
- logger.info('Article updated: {}'.format(article['link']))
- continue
-
- # insertion of the new article
- try:
- new_articles.append(art_contr.create(**new_article))
- logger.info('New article added: {}'.format(new_article['link']))
- except Exception:
- logger.exception('Error when inserting article in database:')
+async def insert_articles(queue, nḅ_producers=1):
+ """Consumer coroutines.
+ """
+ nb_producers_done = 0
+ while True:
+ item = await queue.get()
+ if item is None:
+ nb_producers_done += 1
+ if nb_producers_done == nḅ_producers:
+ print('All producers done.')
+ print('Process finished.')
+ break
continue
- return new_articles
-
-
-async def init_process(user, feed):
- # Fetch the feed and insert new articles in the database
- try:
- articles = await insert_database(user, feed)
- logger.debug('inserted articles for %s', feed.title)
- return articles
- except Exception as e:
- logger.exception('init_process: ' + str(e))
-
-def retrieve_feed(loop, user, feed_id=None):
+ user, feed, articles = item
+
+
+ if None is articles:
+ logger.info('None')
+ articles = []
+
+ logger.info('Inserting articles for {}'.format(feed.title))
+
+ new_articles = []
+ art_contr = ArticleController(user.id)
+ for article in articles:
+ new_article = await construct_article(article, feed)
+
+ try:
+ existing_article_req = art_contr.read(feed_id=feed.id,
+ entry_id=extract_id(article))
+ except Exception as e:
+ logger.exception("existing_article_req: " + str(e))
+ continue
+ exist = existing_article_req.count() != 0
+ if exist:
+ continue
+ # if the article has been already retrieved, we only update
+ # the content or the title
+ logger.info('Article already in the database: {}'. \
+ format(article['link']))
+ existing_article = existing_article_req.first()
+
+ if new_article['date'].replace(tzinfo=None) != \
+ existing_article.date:
+ existing_article.date = new_article['date']
+ existing_article.updated_date = new_article['date']
+ if existing_article.title != new_article['title']:
+ existing_article.title = new_article['title']
+ content = get_article_content(article)
+ if existing_article.content != content:
+ existing_article.content = content
+ existing_article.readed = False
+ art_contr.update({'entry_id': existing_article.entry_id},
+ existing_article.dump())
+ logger.info('Article updated: {}'.format(article['link']))
+ continue
+
+ # insertion of the new article
+ try:
+ new_articles.append(art_contr.create(**new_article))
+ logger.info('New article added: {}'.format(new_article['link']))
+ except Exception:
+ logger.exception('Error when inserting article in database:')
+ continue
+
+
+async def retrieve_feed(queue, users, feed_id=None):
"""
Launch the processus.
"""
- logger.info('Starting to retrieve feeds for {}'.format(user.nickname))
-
- # Get the list of feeds to fetch
- filters = {}
- filters['user_id'] = user.id
- if feed_id is not None:
- filters['id'] = feed_id
- filters['enabled'] = True
- filters['error_count__lt'] = conf.DEFAULT_MAX_ERROR
- filters['last_retrieved__lt'] = datetime.now() - \
- timedelta(minutes=conf.FEED_REFRESH_INTERVAL)
- feeds = FeedController().read(**filters).all()
-
- if feeds == []:
- logger.info('No feed to retrieve for {}'.format(user.nickname))
- return
-
- # Launch the process for all the feeds
- tasks = [asyncio.ensure_future(init_process(user, feed)) for feed in feeds]
+ for user in users:
+ logger.info('Starting to retrieve feeds for {}'.format(user.nickname))
+ filters = {}
+ filters['user_id'] = user.id
+ if feed_id is not None:
+ filters['id'] = feed_id
+ filters['enabled'] = True
+ filters['error_count__lt'] = conf.DEFAULT_MAX_ERROR
+ filters['last_retrieved__lt'] = datetime.now() - \
+ timedelta(minutes=conf.FEED_REFRESH_INTERVAL)
+ feeds = FeedController().read(**filters).all()
+
+
+ if feeds == []:
+ logger.info('No feed to retrieve for {}'.format(user.nickname))
+
+
+ for feed in feeds:
+ articles = parse_feed(user, feed)
+ await queue.put((user, feed, articles))
+
+ await queue.put(None)
- try:
- loop.run_until_complete(asyncio.wait(tasks))
- except:
- logger.exception('an error occured')
- finally:
- logger.info('Articles retrieved for {}'.format(user.nickname))
bgstack15