Optimization of the regular expression used by clear_string in utils.py.

author: Cédric Bonhomme <kimble.mandel@gmail.com> 2012-12-20 23:40:29 +0100
committer: Cédric Bonhomme <kimble.mandel@gmail.com> 2012-12-20 23:40:29 +0100
commit: a10ddebe50e1650d8699fe5c9f08184337ba4dd7 (patch)
tree: 1a47cb4053e36b634c29313da220deb5770f17f1
parent: test get_words for the generation of the tag cloud. (diff)
download: newspipe-a10ddebe50e1650d8699fe5c9f08184337ba4dd7.tar.gz
newspipe-a10ddebe50e1650d8699fe5c9f08184337ba4dd7.tar.bz2
newspipe-a10ddebe50e1650d8699fe5c9f08184337ba4dd7.zip
1 file changed, 2 insertions, 13 deletions
diff --git a/source/utils.py b/source/utils.py
index 58adcfad..a2f3dfb6 100755
--- a/source/utils.py
+++ b/source/utils.py
@@ -83,22 +83,12 @@ def detect_url_errors(list_of_urls):
             errors.append((url, e.reason.errno ,e.reason.strerror))
     return errors
 
-def get_words(html):
-    # Remove all the HTML tags
-    txt=re.compile(r'<[^>]+>').sub('',html)
-
-    # Split words by all non-alpha characters
-    words=re.compile(r'[^A-Z^a-z]+').split(txt)
-
-    # Convert to lowercase
-    return [word.lower() for word in words if word!='']
-
 def clear_string(data):
     """
     Clear a string by removing HTML tags, HTML special caracters
     and consecutive white spaces (more that one).
     """
-    p = re.compile(b'<[^<]*?/?>') # HTML tags
+    p = re.compile(b'<[^>]+>') # HTML tags
     q = re.compile(b'\s') # consecutive white spaces
     return p.sub(b'', q.sub(b' ', bytes(data, "utf-8"))).decode("utf-8", "strict")
 
@@ -170,8 +160,7 @@ def top_words(articles, n=10, size=5):
     words = Counter()
     wordre = re.compile(r'\b\w{%s,}\b' % size, re.I)
     for article in articles:
-        #for word in [elem.lower() for elem in wordre.findall(clear_string(article["article_content"])) if elem.lower() not in stop_words]:
-        for word in [elem.lower() for elem in get_words(article["article_content"]) if elem.lower() not in stop_words]:
+        for word in [elem.lower() for elem in wordre.findall(clear_string(article["article_content"])) if elem.lower() not in stop_words]:
             words[word] += 1
     return words.most_common(n)
author	Cédric Bonhomme <kimble.mandel@gmail.com>	2012-12-20 23:40:29 +0100
committer	Cédric Bonhomme <kimble.mandel@gmail.com>	2012-12-20 23:40:29 +0100
commit	a10ddebe50e1650d8699fe5c9f08184337ba4dd7 (patch)
tree	1a47cb4053e36b634c29313da220deb5770f17f1
parent	test get_words for the generation of the tag cloud. (diff)
download	newspipe-a10ddebe50e1650d8699fe5c9f08184337ba4dd7.tar.gz newspipe-a10ddebe50e1650d8699fe5c9f08184337ba4dd7.tar.bz2 newspipe-a10ddebe50e1650d8699fe5c9f08184337ba4dd7.zip