light refactor

* both crawlers now use the same utils methods * the original crawler now uses more of the controllers, enabling the filters feature
author: François Schmidts <francois.schmidts@gmail.com> 2015-07-06 10:28:58 +0200
committer: François Schmidts <francois.schmidts@gmail.com> 2015-07-06 11:10:33 +0200
commit: 75df52051b167425adcfb68797f77fcbcad33c4e (patch)
tree: 810b6e887c767f35f11e0412a404352390f7cacc /pyaggr3g470r/lib/crawler.py
parent: making bookmarklet work for any url (diff)
download: newspipe-75df52051b167425adcfb68797f77fcbcad33c4e.tar.gz
newspipe-75df52051b167425adcfb68797f77fcbcad33c4e.tar.bz2
newspipe-75df52051b167425adcfb68797f77fcbcad33c4e.zip
1 files changed, 6 insertions, 62 deletions
diff --git a/pyaggr3g470r/lib/crawler.py b/pyaggr3g470r/lib/crawler.py
index 45b1acde..8d2de15f 100644
--- a/pyaggr3g470r/lib/crawler.py
+++ b/pyaggr3g470r/lib/crawler.py
@@ -17,48 +17,19 @@ import conf
 import json
 import logging
 import feedparser
-import dateutil.parser
-from hashlib import md5
 from functools import wraps
-from datetime import datetime
 from time import strftime, gmtime
 from concurrent.futures import ThreadPoolExecutor
 from requests_futures.sessions import FuturesSession
-from pyaggr3g470r.lib.utils import default_handler, construct_feed_from
+from pyaggr3g470r.lib.utils import default_handler, to_hash
+from pyaggr3g470r.lib.feed_utils import construct_feed_from
+from pyaggr3g470r.lib.article_utils import extract_id, construct_article
 
 logger = logging.getLogger(__name__)
 logging.captureWarnings(True)
 API_ROOT = "api/v2.0/"
 
 
-def to_hash(text):
-    return md5(text.encode('utf8')).hexdigest()
-
-
-def extract_id(entry, keys=[('link', 'link'),
-                            ('published', 'retrieved_date'),
-                            ('updated', 'retrieved_date')], force_id=False):
-    """For a given entry will return a dict that allows to identify it. The
-    dict will be constructed on the uid of the entry. if that identifier is
-    absent, the dict will be constructed upon the values of "keys".
-    """
-    entry_id = entry.get('entry_id') or entry.get('id')
-    if entry_id:
-        return {'entry_id': entry_id}
-    if not entry_id and force_id:
-        entry_id = to_hash("".join(entry[entry_key] for _, entry_key in keys
-                                   if entry_key in entry).encode('utf8'))
-    else:
-        ids = {}
-        for entry_key, pyagg_key in keys:
-            if entry_key in entry and pyagg_key not in ids:
-                ids[pyagg_key] = entry[entry_key]
-                if 'date' in pyagg_key:
-                    ids[pyagg_key] = dateutil.parser.parse(ids[pyagg_key])\
-                                                    .isoformat()
-        return ids
-
-
 class AbstractCrawler:
     __session__ = None
     __counter__ = 0
@@ -139,34 +110,6 @@ class PyAggUpdater(AbstractCrawler):
         self.parsed_feed = parsed_feed
         super(PyAggUpdater, self).__init__(auth)
 
-    def to_article(self, entry):
-        "Safe method to transorm a feedparser entry into an article"
-        date = datetime.now()
-
-        for date_key in ('published', 'updated'):
-            if entry.get(date_key):
-                try:
-                    date = dateutil.parser.parse(entry[date_key])
-                except Exception:
-                    pass
-                else:
-                    break
-        content = ''
-        if entry.get('content'):
-            content = entry['content'][0]['value']
-        elif entry.get('summary'):
-            content = entry['summary']
-
-        return {'feed_id': self.feed['id'],
-                'user_id': self.feed['user_id'],
-                'entry_id': extract_id(entry).get('entry_id', None),
-                'link': entry.get('link', self.feed['site_link']),
-                'title': entry.get('title', 'No title'),
-                'readed': False, 'like': False,
-                'content': content,
-                'retrieved_date': date.isoformat(),
-                'date': date.isoformat()}
-
     @AbstractCrawler.count_on_me
     def callback(self, response):
         """Will process the result from the challenge, creating missing article
@@ -176,8 +119,9 @@ class PyAggUpdater(AbstractCrawler):
         logger.debug('%r %r - %d entries were not matched and will be created',
                      self.feed['id'], self.feed['title'], len(results))
         for id_to_create in results:
-            entry = self.to_article(
-                    self.entries[tuple(sorted(id_to_create.items()))])
+            entry = construct_article(
+                    self.entries[tuple(sorted(id_to_create.items()))],
+                    self.feed)
             logger.warn('%r %r - creating %r for %r - %r', self.feed['id'],
                         self.feed['title'], entry['title'], entry['user_id'],
                         id_to_create)
author	François Schmidts <francois.schmidts@gmail.com>	2015-07-06 10:28:58 +0200
committer	François Schmidts <francois.schmidts@gmail.com>	2015-07-06 11:10:33 +0200
commit	75df52051b167425adcfb68797f77fcbcad33c4e (patch)
tree	810b6e887c767f35f11e0412a404352390f7cacc /pyaggr3g470r/lib/crawler.py
parent	making bookmarklet work for any url (diff)
download	newspipe-75df52051b167425adcfb68797f77fcbcad33c4e.tar.gz newspipe-75df52051b167425adcfb68797f77fcbcad33c4e.tar.bz2 newspipe-75df52051b167425adcfb68797f77fcbcad33c4e.zip