aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--pyaggr3g470r/crawler.py164
-rw-r--r--pyaggr3g470r/lib/article_utils.py94
-rw-r--r--pyaggr3g470r/lib/crawler.py68
-rw-r--r--pyaggr3g470r/lib/feed_utils.py89
-rw-r--r--pyaggr3g470r/lib/utils.py83
-rw-r--r--pyaggr3g470r/views/feed.py3
6 files changed, 225 insertions, 276 deletions
diff --git a/pyaggr3g470r/crawler.py b/pyaggr3g470r/crawler.py
index 23f9026e..b70b4e70 100644
--- a/pyaggr3g470r/crawler.py
+++ b/pyaggr3g470r/crawler.py
@@ -27,19 +27,18 @@ __copyright__ = "Copyright (c) Cedric Bonhomme"
__license__ = "AGPLv3"
import asyncio
-import aiohttp
import logging
-import requests
import feedparser
import dateutil.parser
from datetime import datetime
-from bs4 import BeautifulSoup
from sqlalchemy import or_
-from pyaggr3g470r import utils
import conf
from bootstrap import db
-from pyaggr3g470r.models import User, Article
+from pyaggr3g470r.models import User
+from pyaggr3g470r.controllers import FeedController, ArticleController
+from pyaggr3g470r.lib.feed_utils import construct_feed_from
+from pyaggr3g470r.lib.article_utils import construct_article, extract_id
logger = logging.getLogger(__name__)
@@ -85,122 +84,26 @@ def parse_feed(user, feed):
db.session.commit()
return
- #a_feed = feedparser.parse(data)
+ up_feed = {}
if a_feed['bozo'] == 1:
- #logger.error(a_feed['bozo_exception'])
- feed.last_error = str(a_feed['bozo_exception'])
- feed.error_count += 1
+ up_feed['last_error'] = str(a_feed['bozo_exception'])
+ up_feed['error_count'] = feed.error_count + 1
db.session.commit()
if a_feed['entries'] == []:
return
- feed.last_retrieved = datetime.now(dateutil.tz.tzlocal())
- feed.error_count = 0
- feed.last_error = ""
+ up_feed['last_retrieved'] = datetime.now(dateutil.tz.tzlocal())
+ up_feed['error_count'] = 0
+ up_feed['last_error'] = ""
# Feed informations
- try:
- feed.title = a_feed.feed.title
- except:
- feed.title = "No title"
- if feed.link == "":
- try:
- feed.link = a_feed.feed.link
- except:
- feed.link = ""
- try:
- feed.description = a_feed.feed.subtitle
- except:
- feed.description = ""
- try:
-
- feed.icon = [a_feed.feed.get('image', False) and
- a_feed.feed.image.get('href', "") or a_feed.feed.get('icon', "")][0]
- except:
- feed.icon = ""
-
- db.session.commit()
-
- articles = []
- for article in a_feed['entries']:
-
- try:
- nice_url = article.link
- except:
- # if not able to get the link of the article, continue
- continue
- if conf.RESOLVE_ARTICLE_URL:
- try:
- # resolves URL behind proxies
- # (like feedproxy.google.com)
- r = requests.get(article.link, timeout=5.0)
- nice_url = r.url
- except Exception as error:
- logger.warning(
- "Unable to get the real URL of %s. Error: %s",
- article.link, error)
- pass
- # remove utm_* parameters
- nice_url = utils.clean_url(nice_url)
-
- try:
- entry_id = article.id
- except:
- entry_id = nice_url
-
- description = ""
- article_title = article.get('title', '')
- try:
- # article content
- description = article.content[0].value
- except AttributeError:
- # article description
- description = article.get('description', '')
-
- try:
- soup = BeautifulSoup(description, "lxml")
-
- # Prevents BeautifulSoup4 from adding extra <html><body> tags
- # to the soup with the lxml parser.
- if soup.html.body:
- description = soup.html.body.decode_contents()
- elif soup.html:
- description = soup.html.decode_contents()
- else:
- description = soup.decode()
- except:
- logger.error("Problem when sanitizing the content of the article %s (%s)",
- article_title, nice_url)
-
- # Get the date of publication of the article
- post_date = None
- for date_key in ('published_parsed', 'published',
- 'updated_parsed', 'updated'):
- if not date_key in article:
- continue
+ up_feed.update(construct_feed_from(feed.link, a_feed))
+ if feed.title and 'title' in up_feed:
+ del up_feed['title']
+ FeedController().update({'id': feed.id}, up_feed)
- try:
- post_date = dateutil.parser.parse(article[date_key],
- dayfirst=True)
- break
- except:
- try: # trying to clean date field from letters
- post_date = dateutil.parser.parse(
- re.sub('[A-z]', '', article[date_key]),
- dayfirst=True)
- break
- except:
- pass
- else:
- post_date = datetime.now(dateutil.tz.tzlocal())
+ return a_feed['entries']
- # create the models.Article object and append it to the list of articles
- article = Article(entry_id=entry_id, link=nice_url, title=article_title,
- content=description, readed=False, like=False,
- date=post_date, user_id=user.id,
- feed_id=feed.id)
- articles.append(article)
- return articles
@asyncio.coroutine
def insert_database(user, feed):
@@ -209,34 +112,32 @@ def insert_database(user, feed):
if None is articles:
return []
- #print('inserting articles for {}'.format(feed.title))
+ logger.debug('inserting articles for {}'.format(feed.title))
logger.info("Database insertion...")
new_articles = []
- query1 = Article.query.filter(Article.user_id == user.id)
- query2 = query1.filter(Article.feed_id == feed.id)
+ art_contr = ArticleController(user.id)
for article in articles:
- exist = query2.filter(or_(Article.entry_id==article.entry_id, Article.link==article.link)).count() != 0
+ exist = art_contr.read(feed_id=feed.id, **extract_id(article))
if exist:
- #logger.debug("Article %r (%r) already in the database.", article.title, article.link)
+ logger.debug("Article %r (%r) already in the database.",
+ article.title, article.link)
continue
- new_articles.append(article)
+ article = construct_article(article, feed)
try:
- feed.articles.append(article)
- #db.session.merge(article)
- db.session.commit()
- #logger.info("New article % (%r) added.", article.title, article.link)
- except Exception as e:
- logger.error("Error when inserting article in database: " + str(e))
+ new_articles.append(art_contr.create(**article))
+ logger.info("New article % (%r) added.",
+ article.title, article.link)
+ except Exception:
+ logger.exception("Error when inserting article in database:")
continue
- #db.session.close()
return new_articles
@asyncio.coroutine
def init_process(user, feed):
# Fetch the feed and insert new articles in the database
articles = yield from asyncio.async(insert_database(user, feed))
- #print('inserted articles for {}'.format(feed.title))
+ logger.debug('inserted articles for %s', feed.title)
return articles
def retrieve_feed(loop, user, feed_id=None):
@@ -248,24 +149,23 @@ def retrieve_feed(loop, user, feed_id=None):
# Get the list of feeds to fetch
user = User.query.filter(User.email == user.email).first()
feeds = [feed for feed in user.feeds if
- feed.error_count <= conf.DEFAULT_MAX_ERROR and \
- feed.enabled]
+ feed.error_count <= conf.DEFAULT_MAX_ERROR and feed.enabled]
if feed_id is not None:
feeds = [feed for feed in feeds if feed.id == feed_id]
if feeds == []:
return
- import time
# Launch the process for all the feeds
tasks = []
try:
# Python 3.5 (test)
- tasks = [asyncio.ensure_future(init_process(user, feed)) for feed in feeds]
+ tasks = [asyncio.ensure_future(init_process(user, feed))
+ for feed in feeds]
except:
tasks = [init_process(user, feed) for feed in feeds]
try:
loop.run_until_complete(asyncio.wait(tasks))
- except Exception as e:
- print(e)
+ except Exception:
+ logger.exception('an error occured')
logger.info("All articles retrieved. End of the processus.")
diff --git a/pyaggr3g470r/lib/article_utils.py b/pyaggr3g470r/lib/article_utils.py
new file mode 100644
index 00000000..023be9a7
--- /dev/null
+++ b/pyaggr3g470r/lib/article_utils.py
@@ -0,0 +1,94 @@
+import logging
+import requests
+import dateutil.parser
+from datetime import datetime
+from bs4 import BeautifulSoup
+
+import conf
+from pyaggr3g470r.lib.utils import to_hash
+
+logger = logging.getLogger(__name__)
+
+
+def extract_id(entry, keys=[('link', 'link'),
+ ('published', 'retrieved_date'),
+ ('updated', 'retrieved_date')], force_id=False):
+ """For a given entry will return a dict that allows to identify it. The
+ dict will be constructed on the uid of the entry. if that identifier is
+ absent, the dict will be constructed upon the values of "keys".
+ """
+ entry_id = entry.get('entry_id') or entry.get('id')
+ if entry_id:
+ return {'entry_id': entry_id}
+ if not entry_id and force_id:
+ entry_id = to_hash("".join(entry[entry_key] for _, entry_key in keys
+ if entry_key in entry).encode('utf8'))
+ else:
+ ids = {}
+ for entry_key, pyagg_key in keys:
+ if entry_key in entry and pyagg_key not in ids:
+ ids[pyagg_key] = entry[entry_key]
+ if 'date' in pyagg_key:
+ ids[pyagg_key] = dateutil.parser.parse(ids[pyagg_key])\
+ .isoformat()
+ return ids
+
+
+def construct_article(entry, feed):
+ "Safe method to transorm a feedparser entry into an article"
+ now = datetime.now()
+
+ for date_key in ('published', 'updated'):
+ if entry.get(date_key):
+ try:
+ date = dateutil.parser.parse(entry[date_key])
+ except Exception:
+ pass
+ else:
+ break
+ content = ''
+ if entry.get('content'):
+ content = entry['content'][0]['value']
+ elif entry.get('summary'):
+ content = entry['summary']
+
+ description = entry.get('description', '')
+ try:
+ description = entry.content[0].value
+ except Exception:
+ pass
+
+ try:
+ soup = BeautifulSoup(description, "lxml")
+ # Prevents BeautifulSoup4 from adding extra <html><body> tags
+ # to the soup with the lxml parser.
+ if soup.html.body:
+ description = soup.html.body.decode_contents()
+ elif soup.html:
+ description = soup.html.decode_contents()
+ else:
+ description = soup.decode()
+ except Exception:
+ pass
+
+ article_link = entry.get('link')
+ if conf.RESOLVE_ARTICLE_URL and article_link:
+ try:
+ # resolves URL behind proxies
+ # (like feedproxy.google.com)
+ response = requests.get(article_link, verify=False, timeout=5.0)
+ article_link = response.url
+ except Exception as error:
+ logger.warning("Unable to get the real URL of %s. Error: %s",
+ article_link, error)
+
+ return {'feed_id': feed['id'],
+ 'user_id': feed['user_id'],
+ 'entry_id': extract_id(entry).get('entry_id', None),
+ 'link': entry.get('link', feed['site_link']),
+ 'title': entry.get('title', 'No title'),
+ 'readed': False, 'like': False,
+ 'description': description,
+ 'content': content,
+ 'retrieved_date': now.isoformat(),
+ 'date': (date or now).isoformat()}
diff --git a/pyaggr3g470r/lib/crawler.py b/pyaggr3g470r/lib/crawler.py
index 45b1acde..8d2de15f 100644
--- a/pyaggr3g470r/lib/crawler.py
+++ b/pyaggr3g470r/lib/crawler.py
@@ -17,48 +17,19 @@ import conf
import json
import logging
import feedparser
-import dateutil.parser
-from hashlib import md5
from functools import wraps
-from datetime import datetime
from time import strftime, gmtime
from concurrent.futures import ThreadPoolExecutor
from requests_futures.sessions import FuturesSession
-from pyaggr3g470r.lib.utils import default_handler, construct_feed_from
+from pyaggr3g470r.lib.utils import default_handler, to_hash
+from pyaggr3g470r.lib.feed_utils import construct_feed_from
+from pyaggr3g470r.lib.article_utils import extract_id, construct_article
logger = logging.getLogger(__name__)
logging.captureWarnings(True)
API_ROOT = "api/v2.0/"
-def to_hash(text):
- return md5(text.encode('utf8')).hexdigest()
-
-
-def extract_id(entry, keys=[('link', 'link'),
- ('published', 'retrieved_date'),
- ('updated', 'retrieved_date')], force_id=False):
- """For a given entry will return a dict that allows to identify it. The
- dict will be constructed on the uid of the entry. if that identifier is
- absent, the dict will be constructed upon the values of "keys".
- """
- entry_id = entry.get('entry_id') or entry.get('id')
- if entry_id:
- return {'entry_id': entry_id}
- if not entry_id and force_id:
- entry_id = to_hash("".join(entry[entry_key] for _, entry_key in keys
- if entry_key in entry).encode('utf8'))
- else:
- ids = {}
- for entry_key, pyagg_key in keys:
- if entry_key in entry and pyagg_key not in ids:
- ids[pyagg_key] = entry[entry_key]
- if 'date' in pyagg_key:
- ids[pyagg_key] = dateutil.parser.parse(ids[pyagg_key])\
- .isoformat()
- return ids
-
-
class AbstractCrawler:
__session__ = None
__counter__ = 0
@@ -139,34 +110,6 @@ class PyAggUpdater(AbstractCrawler):
self.parsed_feed = parsed_feed
super(PyAggUpdater, self).__init__(auth)
- def to_article(self, entry):
- "Safe method to transorm a feedparser entry into an article"
- date = datetime.now()
-
- for date_key in ('published', 'updated'):
- if entry.get(date_key):
- try:
- date = dateutil.parser.parse(entry[date_key])
- except Exception:
- pass
- else:
- break
- content = ''
- if entry.get('content'):
- content = entry['content'][0]['value']
- elif entry.get('summary'):
- content = entry['summary']
-
- return {'feed_id': self.feed['id'],
- 'user_id': self.feed['user_id'],
- 'entry_id': extract_id(entry).get('entry_id', None),
- 'link': entry.get('link', self.feed['site_link']),
- 'title': entry.get('title', 'No title'),
- 'readed': False, 'like': False,
- 'content': content,
- 'retrieved_date': date.isoformat(),
- 'date': date.isoformat()}
-
@AbstractCrawler.count_on_me
def callback(self, response):
"""Will process the result from the challenge, creating missing article
@@ -176,8 +119,9 @@ class PyAggUpdater(AbstractCrawler):
logger.debug('%r %r - %d entries were not matched and will be created',
self.feed['id'], self.feed['title'], len(results))
for id_to_create in results:
- entry = self.to_article(
- self.entries[tuple(sorted(id_to_create.items()))])
+ entry = construct_article(
+ self.entries[tuple(sorted(id_to_create.items()))],
+ self.feed)
logger.warn('%r %r - creating %r for %r - %r', self.feed['id'],
self.feed['title'], entry['title'], entry['user_id'],
id_to_create)
diff --git a/pyaggr3g470r/lib/feed_utils.py b/pyaggr3g470r/lib/feed_utils.py
new file mode 100644
index 00000000..a7149d79
--- /dev/null
+++ b/pyaggr3g470r/lib/feed_utils.py
@@ -0,0 +1,89 @@
+import urllib
+import logging
+import requests
+import feedparser
+from bs4 import BeautifulSoup, SoupStrainer
+
+from pyaggr3g470r.lib.utils import try_keys, try_splits, rebuild_url
+
+logger = logging.getLogger(__name__)
+
+
+def construct_feed_from(url=None, fp_parsed=None, feed=None, query_site=True):
+ if url is None and fp_parsed is not None:
+ url = fp_parsed.get('url')
+ if url is not None and fp_parsed is None:
+ try:
+ response = requests.get(url, verify=False)
+ fp_parsed = feedparser.parse(response.content,
+ request_headers=response.headers)
+ except Exception:
+ logger.exception('failed to retreive that url')
+ fp_parsed = {'bozo': True}
+ assert url is not None and fp_parsed is not None
+ feed = feed or {}
+ feed_split = urllib.parse.urlsplit(url)
+ if not fp_parsed['bozo']:
+ feed['link'] = url
+ feed['site_link'] = try_keys(fp_parsed['feed'], 'href', 'link')
+ feed['title'] = fp_parsed['feed'].get('title')
+ feed['description'] = try_keys(fp_parsed['feed'], 'subtitle', 'title')
+ feed['icon'] = try_keys(fp_parsed['feed'], 'icon')
+ else:
+ feed['site_link'] = url
+
+ if feed.get('site_link'):
+ feed['site_link'] = rebuild_url(feed['site_link'], feed_split)
+ site_split = urllib.parse.urlsplit(feed['site_link'])
+
+ if feed.get('icon'):
+ feed['icon'] = try_splits(feed['icon'], site_split, feed_split)
+ if feed['icon'] is None:
+ del feed['icon']
+
+ if not feed.get('site_link') or not query_site \
+ or all(bool(feed.get(key)) for key in ('link', 'title', 'icon')):
+ return feed
+
+ response = requests.get(feed['site_link'], verify=False)
+ bs_parsed = BeautifulSoup(response.content, 'html.parser',
+ parse_only=SoupStrainer('head'))
+
+ if not feed.get('title'):
+ try:
+ feed['title'] = bs_parsed.find_all('title')[0].text
+ except Exception:
+ pass
+
+ def check_keys(**kwargs):
+ def wrapper(elem):
+ for key, vals in kwargs.items():
+ if not elem.has_attr(key):
+ return False
+ if not all(val in elem.attrs[key] for val in vals):
+ return False
+ return True
+ return wrapper
+
+ if not feed.get('icon'):
+ icons = bs_parsed.find_all(check_keys(rel=['icon', 'shortcut']))
+ if not len(icons):
+ icons = bs_parsed.find_all(check_keys(rel=['icon']))
+ if len(icons) >= 1:
+ for icon in icons:
+ feed['icon'] = try_splits(icon.attrs['href'],
+ site_split, feed_split)
+ if feed['icon'] is not None:
+ break
+
+ if feed.get('icon') is None:
+ feed['icon'] = try_splits('/favicon.ico', site_split, feed_split)
+ if 'icon' in feed and feed['icon'] is None:
+ del feed['icon']
+
+ if not feed.get('link'):
+ alternate = bs_parsed.find_all(check_keys(rel=['alternate'],
+ type=['application/rss+xml']))
+ if len(alternate) >= 1:
+ feed['link'] = alternate[0].attrs['href']
+ return feed
diff --git a/pyaggr3g470r/lib/utils.py b/pyaggr3g470r/lib/utils.py
index a4f2e043..b7e5cafc 100644
--- a/pyaggr3g470r/lib/utils.py
+++ b/pyaggr3g470r/lib/utils.py
@@ -2,8 +2,7 @@ import types
import urllib
import logging
import requests
-import feedparser
-from bs4 import BeautifulSoup, SoupStrainer
+from hashlib import md5
logger = logging.getLogger(__name__)
@@ -47,81 +46,5 @@ def try_splits(url, *splits):
return None
-def construct_feed_from(url=None, fp_parsed=None, feed=None, query_site=True):
- if url is None and fp_parsed is not None:
- url = fp_parsed.get('url')
- if url is not None and fp_parsed is None:
- try:
- response = requests.get(url, verify=False)
- fp_parsed = feedparser.parse(response.content,
- request_headers=response.headers)
- except Exception:
- logger.exception('failed to retreive that url')
- fp_parsed = {'bozo': True}
- assert url is not None and fp_parsed is not None
- feed = feed or {}
- feed_split = urllib.parse.urlsplit(url)
- if not fp_parsed['bozo']:
- feed['link'] = url
- feed['site_link'] = try_keys(fp_parsed['feed'], 'href', 'link')
- feed['title'] = fp_parsed['feed'].get('title')
- feed['description'] = try_keys(fp_parsed['feed'], 'subtitle', 'title')
- feed['icon'] = try_keys(fp_parsed['feed'], 'icon')
- else:
- feed['site_link'] = url
-
- if feed.get('site_link'):
- feed['site_link'] = rebuild_url(feed['site_link'], feed_split)
- site_split = urllib.parse.urlsplit(feed['site_link'])
-
- if feed.get('icon'):
- feed['icon'] = try_splits(feed['icon'], site_split, feed_split)
- if feed['icon'] is None:
- del feed['icon']
-
- if not feed.get('site_link') or not query_site \
- or all(bool(feed.get(key)) for key in ('link', 'title', 'icon')):
- return feed
-
- response = requests.get(feed['site_link'], verify=False)
- bs_parsed = BeautifulSoup(response.content, 'html.parser',
- parse_only=SoupStrainer('head'))
-
- if not feed.get('title'):
- try:
- feed['title'] = bs_parsed.find_all('title')[0].text
- except Exception:
- pass
-
- def check_keys(**kwargs):
- def wrapper(elem):
- for key, vals in kwargs.items():
- if not elem.has_attr(key):
- return False
- if not all(val in elem.attrs[key] for val in vals):
- return False
- return True
- return wrapper
-
- if not feed.get('icon'):
- icons = bs_parsed.find_all(check_keys(rel=['icon', 'shortcut']))
- if not len(icons):
- icons = bs_parsed.find_all(check_keys(rel=['icon']))
- if len(icons) >= 1:
- for icon in icons:
- feed['icon'] = try_splits(icon.attrs['href'],
- site_split, feed_split)
- if feed['icon'] is not None:
- break
-
- if feed.get('icon') is None:
- feed['icon'] = try_splits('/favicon.ico', site_split, feed_split)
- if 'icon' in feed and feed['icon'] is None:
- del feed['icon']
-
- if not feed.get('link'):
- alternate = bs_parsed.find_all(check_keys(rel=['alternate'],
- type=['application/rss+xml']))
- if len(alternate) >= 1:
- feed['link'] = alternate[0].attrs['href']
- return feed
+def to_hash(text):
+ return md5(text.encode('utf8')).hexdigest()
diff --git a/pyaggr3g470r/views/feed.py b/pyaggr3g470r/views/feed.py
index 224e27fb..4c848b0e 100644
--- a/pyaggr3g470r/views/feed.py
+++ b/pyaggr3g470r/views/feed.py
@@ -1,6 +1,5 @@
#! /usr/bin/env python
# -*- coding: utf-8 -
-import logging
from datetime import datetime
from sqlalchemy import desc
from werkzeug.exceptions import BadRequest
@@ -12,7 +11,7 @@ from flask.ext.login import login_required
import conf
from pyaggr3g470r import utils
-from pyaggr3g470r.lib.utils import construct_feed_from
+from pyaggr3g470r.lib.feed.utils import construct_feed_from
from pyaggr3g470r.forms import AddFeedForm
from pyaggr3g470r.controllers import FeedController, ArticleController
bgstack15