aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rwxr-xr-xsource/utils.py10
1 files changed, 10 insertions, 0 deletions
diff --git a/source/utils.py b/source/utils.py
index d9b21169..14c0096f 100755
--- a/source/utils.py
+++ b/source/utils.py
@@ -83,6 +83,16 @@ def detect_url_errors(list_of_urls):
errors.append((url, e.reason.errno ,e.reason.strerror))
return errors
def getwords(html):
    """
    Extract the lowercase alphabetic words contained in an HTML string.

    HTML tags are stripped, the remaining text is split on every run of
    characters outside A-Z / a-z, and each resulting word is lowercased.

    :param html: the HTML (or plain text) string to process.
    :return: list of lowercase words, in order of appearance; an empty
             list when the input contains no ASCII letters.
    """
    # Remove all the HTML tags. Tags are replaced by the empty string,
    # so text on either side of a tag is joined without a separator.
    text = re.sub(r'<[^>]+>', '', html)

    # Split words by all non-alpha characters. The class must contain a
    # single leading '^': a second '^' is a literal caret and would make
    # '^' count as a word character instead of a separator.
    words = re.split(r'[^A-Za-z]+', text)

    # Drop the empty strings produced at the edges and convert to lowercase
    return [word.lower() for word in words if word]
+
def clear_string(data):
"""
Clear a string by removing HTML tags, HTML special caracters
bgstack15