diff options
author | Cédric Bonhomme <cedric@cedricbonhomme.org> | 2014-04-27 02:09:26 +0200 |
---|---|---|
committer | Cédric Bonhomme <cedric@cedricbonhomme.org> | 2014-04-27 02:09:26 +0200 |
commit | 89cf405ab970c4e289b7b79485b27aed8edb1a41 (patch) | |
tree | 7f6330fb7b9bab82a84aeb2a639377b9c4fe1d80 /pyaggr3g470r/utils.py | |
parent | This fixes #3. (diff) | |
download | newspipe-89cf405ab970c4e289b7b79485b27aed8edb1a41.tar.gz newspipe-89cf405ab970c4e289b7b79485b27aed8edb1a41.tar.bz2 newspipe-89cf405ab970c4e289b7b79485b27aed8edb1a41.zip |
Cleaned code.
Diffstat (limited to 'pyaggr3g470r/utils.py')
-rwxr-xr-x | pyaggr3g470r/utils.py | 24 |
1 file changed, 17 insertions, 7 deletions
diff --git a/pyaggr3g470r/utils.py b/pyaggr3g470r/utils.py index ce210c20..88a3904a 100755 --- a/pyaggr3g470r/utils.py +++ b/pyaggr3g470r/utils.py @@ -34,12 +34,10 @@ __license__ = "AGPLv3" # - e-mail notifications. # -import os import re import glob import opml import operator -import calendar from urllib import urlencode from urlparse import urlparse, parse_qs, urlunparse from BeautifulSoup import BeautifulSoup @@ -52,7 +50,7 @@ from pyaggr3g470r import db from pyaggr3g470r.models import User, Feed # regular expression to check URL -url_finders = [ \ +url_finders = [ re.compile("([0-9]{1,3}\\.[0-9]{1,3}\\.[0-9]{1,3}\\.[0-9]{1,3}|(((news|telnet|nttp|file|http|ftp|https)://)|(www|ftp)[-A-Za-z0-9]*\\.)[-A-Za-z0-9\\.]+)(:[0-9]*)?/[-A-Za-z0-9_\\$\\.\\+\\!\\*\\(\\),;:@&=\\?/~\\#\\%]*[^]'\\.}>\\),\\\"]"), \ re.compile("([0-9]{1,3}\\.[0-9]{1,3}\\.[0-9]{1,3}\\.[0-9]{1,3}|(((news|telnet|nttp|file|http|ftp|https)://)|(www|ftp)[-A-Za-z0-9]*\\.)[-A-Za-z0-9\\.]+)(:[0-9]*)?"), \ re.compile("(~/|/|\\./)([-A-Za-z0-9_\\$\\.\\+\\!\\*\\(\\),;:@&=\\?/~\\#\\%]|\\\\)+"), \ @@ -62,6 +60,7 @@ url_finders = [ \ #import log #pyaggr3g470r_log = log.Log() + @contextmanager def opened_w_error(filename, mode="r"): try: @@ -74,6 +73,7 @@ def opened_w_error(filename, mode="r"): finally: f.close() + def import_opml(email, opml_file): """ Import new feeds from an OPML file. @@ -89,11 +89,11 @@ def import_opml(email, opml_file): Parse recursively through the categories and sub-categories. 
""" for subscription in subsubscription: - + if len(subscription) != 0: nb = read(subscription, nb) else: - + try: title = subscription.text @@ -118,7 +118,9 @@ def import_opml(email, opml_file): except: site_link = "" - new_feed = Feed(title=title, description=description, link=link, site_link=site_link, email_notification=False, enabled=True) + new_feed = Feed(title=title, description=description, + link=link, site_link=site_link, + email_notification=False, enabled=True) user.feeds.append(new_feed) nb += 1 @@ -128,13 +130,15 @@ def import_opml(email, opml_file): db.session.commit() return nb + def clean_url(url): """ Remove utm_* parameters """ parsed_url = urlparse(url) qd = parse_qs(parsed_url.query, keep_blank_values=True) - filtered = dict((k, v) for k, v in qd.iteritems() if not k.startswith('utm_')) + filtered = dict((k, v) for k, v in qd.iteritems() + if not k.startswith('utm_')) nice_url = urlunparse([ parsed_url.scheme, parsed_url.netloc, @@ -145,6 +149,7 @@ def clean_url(url): ]) return nice_url + def open_url(url): """ Open an URL with the proxy and the user-agent @@ -175,6 +180,7 @@ def open_url(url): #pyaggr3g470r_log.error(url + " " + str(e.reason.errno) + " " + e.reason.strerror) return (False, error) + def clear_string(data): """ Clear a string by removing HTML tags, HTML special caracters @@ -184,6 +190,7 @@ def clear_string(data): q = re.compile('\s') # consecutive white spaces return p.sub('', q.sub(' ', data)) + def load_stop_words(): """ Load the stop words and return them in a list. @@ -199,6 +206,7 @@ def load_stop_words(): stop_words += stop_wods_file.read().split(";") return stop_words + def top_words(articles, n=10, size=5): """ Return the n most frequent words in a list. @@ -213,6 +221,7 @@ def top_words(articles, n=10, size=5): words[word] += 1 return words.most_common(n) + def tag_cloud(tags): """ Generates a tags cloud. 
@@ -222,6 +231,7 @@ def tag_cloud(tags): (min(1 + count * 7 / max([tag[1] for tag in tags]), 7), word, format(count, ',d'), word)) \ for (word, count) in tags]) + def search_feed(url): """ Search a feed in a HTML page. |