author    Cédric Bonhomme <cedric@cedricbonhomme.org>  2015-07-06 15:26:29 +0200
committer Cédric Bonhomme <cedric@cedricbonhomme.org>  2015-07-06 15:26:29 +0200
commit    a867c1243c80843f3736ee260b92d5b13ec510ec (patch)
tree      18f4d724abb04d9a14ad7dd5c0b1aca8eb98b3ca
parent    Merged in jaesivsm/pyaggr3g470r (pull request #16) (diff)
Minor fixes from a quick review. Needs deeper testing.
-rwxr-xr-x  manager.py                         | 36
-rw-r--r--  pyaggr3g470r/crawler.py            |  6
-rw-r--r--  pyaggr3g470r/lib/article_utils.py  | 26
3 files changed, 25 insertions(+), 43 deletions(-)
diff --git a/manager.py b/manager.py
index a4743895..1c038dd6 100755
--- a/manager.py
+++ b/manager.py
@@ -41,28 +41,30 @@ def fetch_asyncio(user_id, feed_id):
     with application.app_context():
         populate_g()
+        from flask import g
         from pyaggr3g470r.models import User
         from pyaggr3g470r import crawler
-    users, feed_id = [], None
-    try:
-        users = User.query.filter(User.id == int(user_id)).all()
-    except:
-        users = User.query.all()
-    finally:
-        if users == []:
-            users = User.query.all()
+        users, feed_id = [], None
+        try:
+            users = User.query.filter(User.id == int(user_id)).all()
+        except:
+            users = User.query.all()
+        finally:
+            if users == []:
+                users = User.query.all()
 
-    try:
-        feed_id = int(feed_id)
-    except:
-        feed_id = None
+        try:
+            feed_id = int(feed_id)
+        except:
+            feed_id = None
 
-    loop = asyncio.get_event_loop()
-    for user in users:
-        if user.activation_key == "":
-            print("Fetching articles for " + user.nickname)
-            feed_getter = crawler.retrieve_feed(loop, user, feed_id)
-    loop.close()
+        loop = asyncio.get_event_loop()
+        for user in users:
+            if user.activation_key == "":
+                print("Fetching articles for " + user.nickname)
+                g.user = user
+                feed_getter = crawler.retrieve_feed(loop, g.user, feed_id)
+        loop.close()
 
 
 if __name__ == '__main__':
     manager.run()
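
The net effect in manager.py is that the whole fetch loop now runs inside application.app_context(), and the current user is published on Flask's g proxy (g.user = user) before crawler.retrieve_feed() is called, so code deeper in the crawler can read g.user instead of taking the user as a parameter. A minimal sketch of that pattern, assuming only stock Flask (the do_work name is hypothetical, not from this codebase):

    from flask import Flask, g

    app = Flask(__name__)

    def do_work():
        # Anything called inside the context can read g.user without the
        # user object being threaded through every function signature.
        print("working for", g.user)

    with app.app_context():
        g.user = "alice"  # mirrors the g.user = user line in fetch_asyncio
        do_work()

Note that g is only valid while a context is active; setting it per loop iteration, as fetch_asyncio does, keeps each user's crawl scoped to that user.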
diff --git a/pyaggr3g470r/crawler.py b/pyaggr3g470r/crawler.py
index b70b4e70..4ebca1a3 100644
--- a/pyaggr3g470r/crawler.py
+++ b/pyaggr3g470r/crawler.py
@@ -64,7 +64,6 @@ def get(*args, **kwargs):
         data = feedparser.parse(args[0])
         return data
     except Exception as e:
-        #print(e)
         raise e
 
 @asyncio.coroutine
@@ -118,7 +117,8 @@ def insert_database(user, feed):
     new_articles = []
     art_contr = ArticleController(user.id)
     for article in articles:
-        exist = art_contr.read(feed_id=feed.id, **extract_id(article))
+        exist = art_contr.read(feed_id=feed.id,
+                               **extract_id(article)).count() != 0
         if exist:
             logger.debug("Article %r (%r) already in the database.",
                          article.title, article.link)
@@ -128,7 +128,7 @@ def insert_database(user, feed):
             new_articles.append(art_contr.create(**article))
             logger.info("New article % (%r) added.",
                         article.title, article.link)
-        except Exception:
+        except Exception as e:
             logger.exception("Error when inserting article in database:")
             continue
     return new_articles
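
The duplicate check is the substantive fix in crawler.py: if art_contr.read(...) returns a SQLAlchemy query object (an assumption, since the controller code is not part of this diff), the object is always truthy, so the old test flagged every article as already present. Comparing .count() against zero actually asks the database. A standalone sketch of the pitfall, using SQLAlchemy 1.4+ imports:

    from sqlalchemy import Column, Integer, String, create_engine
    from sqlalchemy.orm import Session, declarative_base

    Base = declarative_base()

    class Article(Base):
        __tablename__ = 'article'
        id = Column(Integer, primary_key=True)
        link = Column(String)

    engine = create_engine('sqlite://')  # in-memory database, no rows
    Base.metadata.create_all(engine)

    with Session(engine) as session:
        query = session.query(Article).filter_by(link='http://example.com/a')
        print(bool(query))          # True: Query defines no __bool__, so the
                                    # object itself is always truthy
        print(query.count() != 0)   # False: the table is empty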
diff --git a/pyaggr3g470r/lib/article_utils.py b/pyaggr3g470r/lib/article_utils.py
index 023be9a7..3c642167 100644
--- a/pyaggr3g470r/lib/article_utils.py
+++ b/pyaggr3g470r/lib/article_utils.py
@@ -52,25 +52,6 @@ def construct_article(entry, feed):
     elif entry.get('summary'):
         content = entry['summary']
 
-    description = entry.get('description', '')
-    try:
-        description = entry.content[0].value
-    except Exception:
-        pass
-
-    try:
-        soup = BeautifulSoup(description, "lxml")
-        # Prevents BeautifulSoup4 from adding extra <html><body> tags
-        # to the soup with the lxml parser.
-        if soup.html.body:
-            description = soup.html.body.decode_contents()
-        elif soup.html:
-            description = soup.html.decode_contents()
-        else:
-            description = soup.decode()
-    except Exception:
-        pass
-
     article_link = entry.get('link')
     if conf.RESOLVE_ARTICLE_URL and article_link:
         try:
@@ -82,13 +63,12 @@ def construct_article(entry, feed):
             logger.warning("Unable to get the real URL of %s. Error: %s",
                            article_link, error)
 
-    return {'feed_id': feed['id'],
-            'user_id': feed['user_id'],
+    return {'feed_id': feed.id,
+            'user_id': feed.user_id,
             'entry_id': extract_id(entry).get('entry_id', None),
-            'link': entry.get('link', feed['site_link']),
+            'link': entry.get('link', feed.site_link),
             'title': entry.get('title', 'No title'),
             'readed': False, 'like': False,
-            'description': description,
             'content': content,
             'retrieved_date': now.isoformat(),
             'date': (date or now).isoformat()}
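
The return statement in article_utils.py now reads feed.id, feed.user_id and feed.site_link as attributes rather than dict keys, which implies construct_article() now receives a feed model object instead of a plain dict (an inference from the diff alone; the call sites are not shown). Dropping the 'description' key matches the extraction block deleted earlier in the same file. The two access styles are not interchangeable, as a tiny sketch with a hypothetical Feed class shows:

    class Feed:
        def __init__(self, id, user_id, site_link):
            self.id = id
            self.user_id = user_id
            self.site_link = site_link

    feed_dict = {'id': 1, 'user_id': 2, 'site_link': 'http://example.com'}
    feed_obj = Feed(**feed_dict)

    print(feed_dict['site_link'])  # subscripting a Feed object would raise TypeError
    print(feed_obj.site_link)      # attribute access on a dict would raise AttributeError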