path: root/pyaggr3g470r/lib/article_utils.py
import logging
import requests
import dateutil.parser
from datetime import datetime
from bs4 import BeautifulSoup

import conf
from pyaggr3g470r.lib.utils import to_hash

logger = logging.getLogger(__name__)


def extract_id(entry, keys=[('link', 'link'),
                            ('published', 'retrieved_date'),
                            ('updated', 'retrieved_date')], force_id=False):
    """For a given entry, return a dict that identifies it. The dict is
    built from the uid of the entry; if that identifier is absent, it is
    built from the values of "keys".
    """
    entry_id = entry.get('entry_id') or entry.get('id')
    if entry_id:
        return {'entry_id': entry_id}
    if not entry_id and force_id:
        entry_id = to_hash("".join(entry[entry_key] for entry_key, _ in keys
                                   if entry_key in entry).encode('utf8'))
        return {'entry_id': entry_id}
    else:
        ids = {}
        for entry_key, pyagg_key in keys:
            if entry_key in entry and pyagg_key not in ids:
                ids[pyagg_key] = entry[entry_key]
                if 'date' in pyagg_key:
                    ids[pyagg_key] = dateutil.parser.parse(ids[pyagg_key])\
                                                    .isoformat()
        return ids
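
# Illustrative usage (the entries below are made-up, feedparser-style dicts;
# the output shown is what the key mapping above would produce):
#     >>> extract_id({'id': 'some-unique-id'})
#     {'entry_id': 'some-unique-id'}
#     >>> extract_id({'link': 'http://example.org/post',
#     ...             'published': '2015-01-01T00:00:00Z'})
#     {'link': 'http://example.org/post',
#      'retrieved_date': '2015-01-01T00:00:00+00:00'}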


def construct_article(entry, feed):
    "Safe method to transorm a feedparser entry into an article"
    now = datetime.now()
    date = None

    for date_key in ('published', 'updated'):
        if entry.get(date_key):
            try:
                date = dateutil.parser.parse(entry[date_key])
            except Exception:
                pass
            else:
                break
    content = ''
    if entry.get('content'):
        content = entry['content'][0]['value']
    elif entry.get('summary'):
        content = entry['summary']

    description = entry.get('description', '')
    try:
        # prefer the full content when feedparser exposes it as an attribute
        description = entry.content[0].value
    except Exception:
        pass

    try:
        soup = BeautifulSoup(description, "lxml")
        # Prevents BeautifulSoup4 from adding extra <html><body> tags
        # to the soup with the lxml parser.
        if soup.html.body:
            description = soup.html.body.decode_contents()
        elif soup.html:
            description = soup.html.decode_contents()
        else:
            description = soup.decode()
    except Exception:
        pass

    article_link = entry.get('link')
    if conf.RESOLVE_ARTICLE_URL and article_link:
        try:
            # resolves URL behind proxies
            # (like feedproxy.google.com)
            response = requests.get(article_link, verify=False, timeout=5.0)
            article_link = response.url
        except Exception as error:
            logger.warning("Unable to get the real URL of %s. Error: %s",
                           article_link, error)

    return {'feed_id': feed['id'],
            'user_id': feed['user_id'],
            'entry_id': extract_id(entry).get('entry_id', None),
            'link': entry.get('link', feed['site_link']),
            'title': entry.get('title', 'No title'),
            'readed': False, 'like': False,
            'description': description,
            'content': content,
            'retrieved_date': now.isoformat(),
            'date': (date or now).isoformat()}
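

# Illustrative usage (the entry below is a minimal, made-up feedparser-style
# dict and the feed dict only carries the fields construct_article reads):
#     entry = {'id': 'some-unique-id',
#              'link': 'http://example.org/post',
#              'title': 'A post',
#              'published': '2015-01-01T00:00:00Z',
#              'summary': '<p>Hello</p>'}
#     feed = {'id': 1, 'user_id': 1, 'site_link': 'http://example.org'}
#     article = construct_article(entry, feed)
#     # article['entry_id'] == 'some-unique-id'
#     # article['content'] == '<p>Hello</p>'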