From 13547a54dc2515aeb414683c769913d95564403b Mon Sep 17 00:00:00 2001
From: Cédric Bonhomme <kimble.mandel@gmail.com>
Date: Thu, 20 Dec 2012 23:31:42 +0100
Subject: Added getwords(html) function.

---
 source/utils.py | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/source/utils.py b/source/utils.py
index d9b21169..14c0096f 100755
--- a/source/utils.py
+++ b/source/utils.py
@@ -83,6 +83,16 @@ def detect_url_errors(list_of_urls):
             errors.append((url, e.reason.errno ,e.reason.strerror))
     return errors
 
+def getwords(html):
+    """Return the lowercased ASCII-letter words found in *html*, tags removed."""
+    # Drop anything that looks like an HTML tag (<...>).
+    text = re.sub(r'<[^>]+>', '', html)
+
+    # Split on runs of non-letters. Only the leading '^' negates the class; a
+    # second '^' would be a literal caret and keep '^' inside the words.
+    tokens = re.split(r'[^A-Za-z]+', text)
+    return [token.lower() for token in tokens if token]
+
 def clear_string(data):
     """
     Clear a string by removing HTML tags, HTML special caracters
-- 
cgit