From a10ddebe50e1650d8699fe5c9f08184337ba4dd7 Mon Sep 17 00:00:00 2001
From: Cédric Bonhomme
Date: Thu, 20 Dec 2012 23:40:29 +0100
Subject: Optimization of the regular expression used by clear_string in utils.py.

---
 source/utils.py | 15 ++-------------
 1 file changed, 2 insertions(+), 13 deletions(-)

diff --git a/source/utils.py b/source/utils.py
index 58adcfad..a2f3dfb6 100755
--- a/source/utils.py
+++ b/source/utils.py
@@ -83,22 +83,12 @@ def detect_url_errors(list_of_urls):
             errors.append((url, e.reason.errno ,e.reason.strerror))
     return errors
 
-def get_words(html):
-    # Remove all the HTML tags
-    txt=re.compile(r'<[^>]+>').sub('',html)
-
-    # Split words by all non-alpha characters
-    words=re.compile(r'[^A-Z^a-z]+').split(txt)
-
-    # Convert to lowercase
-    return [word.lower() for word in words if word!='']
-
 def clear_string(data):
     """
     Clear a string by removing HTML tags, HTML special characters
     and consecutive white spaces (more than one).
     """
-    p = re.compile(b'<[^<]*?/?>') # HTML tags
+    p = re.compile(b'<[^>]+>') # HTML tags
     q = re.compile(b'\s') # consecutive white spaces
     return p.sub(b'', q.sub(b' ', bytes(data, "utf-8"))).decode("utf-8", "strict")
 
@@ -170,8 +160,7 @@ def top_words(articles, n=10, size=5):
     words = Counter()
     wordre = re.compile(r'\b\w{%s,}\b' % size, re.I)
    for article in articles:
-        #for word in [elem.lower() for elem in wordre.findall(clear_string(article["article_content"])) if elem.lower() not in stop_words]:
-        for word in [elem.lower() for elem in get_words(article["article_content"]) if elem.lower() not in stop_words]:
+        for word in [elem.lower() for elem in wordre.findall(clear_string(article["article_content"])) if elem.lower() not in stop_words]:
             words[word] += 1
     return words.most_common(n)
 
--
cgit
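Editor's note, not part of the patch: a minimal sketch of why the new pattern is an optimization. The old lazy pattern `<[^<]*?/?>` re-attempts the `/?>`, tail after every character it absorbs, while `<[^>]+>` consumes the tag body in a single greedy run to the closing `>`. The sample markup and timing harness below are illustrative assumptions, not from the commit.

```python
# Standalone sketch comparing the old and new tag-stripping patterns
# from clear_string(). The sample input is a made-up example.
import re
import timeit

old_tags = re.compile(b'<[^<]*?/?>')  # old: lazy quantifier, retries '/?>' per character
new_tags = re.compile(b'<[^>]+>')     # new: greedy negated class, one pass per tag
spaces = re.compile(rb'\s')           # maps each whitespace character to ' '

sample = b"<p>Hello, <a href='/x'>world</a>!</p>\n\t<br/>"

# Both patterns strip well-formed tags identically; only the matching cost differs.
assert old_tags.sub(b'', sample) == new_tags.sub(b'', sample)

for name, pattern in (("old", old_tags), ("new", new_tags)):
    t = timeit.timeit(lambda: pattern.sub(b'', sample), number=100_000)
    print(name, round(t, 3))
```

On typical feed markup the two patterns produce the same output, so the commit can also drop the redundant `get_words()` helper and let `top_words()` reuse `clear_string()` with its word regex.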