aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Cédric Bonhomme <kimble.mandel@gmail.com> 2012-12-20 23:31:42 +0100
committer Cédric Bonhomme <kimble.mandel@gmail.com> 2012-12-20 23:31:42 +0100
commit 13547a54dc2515aeb414683c769913d95564403b (patch)
tree 6edd0ce0a5237fa9ba118781ccedc10ff2d4972b
parent Typo. (diff)
download newspipe-13547a54dc2515aeb414683c769913d95564403b.tar.gz
newspipe-13547a54dc2515aeb414683c769913d95564403b.tar.bz2
newspipe-13547a54dc2515aeb414683c769913d95564403b.zip
Added getwords(html) function.
-rwxr-xr-xsource/utils.py10
1 files changed, 10 insertions, 0 deletions
diff --git a/source/utils.py b/source/utils.py
index d9b21169..14c0096f 100755
--- a/source/utils.py
+++ b/source/utils.py
@@ -83,6 +83,16 @@ def detect_url_errors(list_of_urls):
errors.append((url, e.reason.errno ,e.reason.strerror))
return errors
def getwords(html):
    """
    Return the list of lowercase alphabetic words found in an HTML string.

    HTML tags are removed first (without inserting whitespace, so text on
    either side of a tag is joined), then the remaining text is split on
    every run of characters that are not ASCII letters. Empty fragments
    are discarded and each word is converted to lowercase.
    """
    # Remove all the HTML tags
    txt = re.sub(r'<[^>]+>', '', html)

    # Split words by all non-alpha characters. Only the leading '^' negates
    # a character class; a second '^' inside it would be a literal caret and
    # would make '^' count as a letter, so it must not appear here.
    words = re.split(r'[^A-Za-z]+', txt)

    # Convert to lowercase, skipping the empty strings that split() yields
    # when the text starts or ends with a separator.
    return [word.lower() for word in words if word]
+
def clear_string(data):
"""
Clear a string by removing HTML tags, HTML special caracters
bgstack15