From f3e68af10d4960df8ca59de3efb0ae2f53522f12 Mon Sep 17 00:00:00 2001
From: cedricbonhomme
Date: Wed, 28 Apr 2010 08:24:41 +0200
Subject: New regular expression to remove special characters (for instance ’).

---
 utils.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

(limited to 'utils.py')

diff --git a/utils.py b/utils.py
index 3b8b376a..fc945f34 100755
--- a/utils.py
+++ b/utils.py
@@ -74,7 +74,8 @@ def remove_html_tags(data):
     Remove HTML tags for the search.
     """
     p = re.compile(r'<[^<]*?/?>')
-    return p.sub('', data)
+    q = re.compile(r'&#[0-9]+;')
+    return p.sub('', q.sub('', data))
 
 def top_words(dic_articles, n=10, size=5):
     """
--
cgit