| author | Cédric Bonhomme <cedric@cedricbonhomme.org> | 2015-07-06 15:26:29 +0200 |
| --- | --- | --- |
| committer | Cédric Bonhomme <cedric@cedricbonhomme.org> | 2015-07-06 15:26:29 +0200 |
| commit | a867c1243c80843f3736ee260b92d5b13ec510ec (patch) | |
| tree | 18f4d724abb04d9a14ad7dd5c0b1aca8eb98b3ca | |
| parent | Merged in jaesivsm/pyaggr3g470r (pull request #16) (diff) | |
| download | newspipe-a867c1243c80843f3736ee260b92d5b13ec510ec.tar.gz (also .tar.bz2, .zip) | |
Minor fixes from a quick review. Needs deeper testing.
| -rwxr-xr-x | manager.py | 36 |
| -rw-r--r-- | pyaggr3g470r/crawler.py | 6 |
| -rw-r--r-- | pyaggr3g470r/lib/article_utils.py | 26 |
3 files changed, 25 insertions, 43 deletions
diff --git a/manager.py b/manager.py
--- a/manager.py
+++ b/manager.py
@@ -41,28 +41,30 @@ def fetch_asyncio(user_id, feed_id):
     with application.app_context():
         populate_g()
+        from flask import g
         from pyaggr3g470r.models import User
         from pyaggr3g470r import crawler
 
-    users, feed_id = [], None
-    try:
-        users = User.query.filter(User.id == int(user_id)).all()
-    except:
-        users = User.query.all()
-    finally:
-        if users == []:
+        users, feed_id = [], None
+        try:
+            users = User.query.filter(User.id == int(user_id)).all()
+        except:
             users = User.query.all()
+        finally:
+            if users == []:
+                users = User.query.all()
 
-    try:
-        feed_id = int(feed_id)
-    except:
-        feed_id = None
+        try:
+            feed_id = int(feed_id)
+        except:
+            feed_id = None
 
-    loop = asyncio.get_event_loop()
-    for user in users:
-        if user.activation_key == "":
-            print("Fetching articles for " + user.nickname)
-            feed_getter = crawler.retrieve_feed(loop, user, feed_id)
-    loop.close()
+        loop = asyncio.get_event_loop()
+        for user in users:
+            if user.activation_key == "":
+                print("Fetching articles for " + user.nickname)
+                g.user = user
+                feed_getter = crawler.retrieve_feed(loop, g.user, feed_id)
+        loop.close()
 
 if __name__ == '__main__':
     manager.run()
diff --git a/pyaggr3g470r/crawler.py b/pyaggr3g470r/crawler.py
index b70b4e70..4ebca1a3 100644
--- a/pyaggr3g470r/crawler.py
+++ b/pyaggr3g470r/crawler.py
@@ -64,7 +64,6 @@ def get(*args, **kwargs):
         data = feedparser.parse(args[0])
         return data
     except Exception as e:
-        #print(e)
         raise e
 
 @asyncio.coroutine
@@ -118,7 +117,8 @@ def insert_database(user, feed):
     new_articles = []
     art_contr = ArticleController(user.id)
     for article in articles:
-        exist = art_contr.read(feed_id=feed.id, **extract_id(article))
+        exist = art_contr.read(feed_id=feed.id,
+                               **extract_id(article)).count() != 0
         if exist:
             logger.debug("Article %r (%r) already in the database.",
                          article.title, article.link)
@@ -128,7 +128,7 @@
             new_articles.append(art_contr.create(**article))
             logger.info("New article % (%r) added.",
                         article.title, article.link)
-        except Exception:
+        except Exception as e:
             logger.exception("Error when inserting article in database:")
             continue
     return new_articles
diff --git a/pyaggr3g470r/lib/article_utils.py b/pyaggr3g470r/lib/article_utils.py
index 023be9a7..3c642167 100644
--- a/pyaggr3g470r/lib/article_utils.py
+++ b/pyaggr3g470r/lib/article_utils.py
@@ -52,25 +52,6 @@ def construct_article(entry, feed):
     elif entry.get('summary'):
         content = entry['summary']
 
-    description = entry.get('description', '')
-    try:
-        description = entry.content[0].value
-    except Exception:
-        pass
-
-    try:
-        soup = BeautifulSoup(description, "lxml")
-        # Prevents BeautifulSoup4 from adding extra <html><body> tags
-        # to the soup with the lxml parser.
-        if soup.html.body:
-            description = soup.html.body.decode_contents()
-        elif soup.html:
-            description = soup.html.decode_contents()
-        else:
-            description = soup.decode()
-    except Exception:
-        pass
-
     article_link = entry.get('link')
     if conf.RESOLVE_ARTICLE_URL and article_link:
         try:
@@ -82,13 +63,12 @@
             logger.warning("Unable to get the real URL of %s. "
                            "Error: %s", article_link, error)
 
-    return {'feed_id': feed['id'],
-            'user_id': feed['user_id'],
+    return {'feed_id': feed.id,
+            'user_id': feed.user_id,
             'entry_id': extract_id(entry).get('entry_id', None),
-            'link': entry.get('link', feed['site_link']),
+            'link': entry.get('link', feed.site_link),
             'title': entry.get('title', 'No title'),
             'readed': False, 'like': False,
-            'description': description,
             'content': content,
             'retrieved_date': now.isoformat(),
             'date': (date or now).isoformat()}