From ba14debbde6cde5d77f5d57e67d4bef341042857 Mon Sep 17 00:00:00 2001 From: cedricbonhomme Date: Tue, 1 May 2012 14:10:25 +0200 Subject: urlsafe_b64encode is replaced by SHA1 for id of articles. --- source/feedgetter.py | 9 +++++++-- source/sqlite2mongo.py | 9 +++++++-- source/utils.py | 14 -------------- 3 files changed, 14 insertions(+), 18 deletions(-) diff --git a/source/feedgetter.py b/source/feedgetter.py index aa25f2a3..59322e6a 100755 --- a/source/feedgetter.py +++ b/source/feedgetter.py @@ -26,6 +26,7 @@ __revision__ = "$Date: 2012/04/22 $" __copyright__ = "Copyright (c) Cedric Bonhomme" __license__ = "GPLv3" +import hashlib import threading import feedparser from BeautifulSoup import BeautifulSoup @@ -96,7 +97,9 @@ class FeedGetter(object): except: feed_image = "/img/feed-icon-28x28.png" - feed_id = utils.uri_b64encode(feed_link.encode('utf-8')) + sha1_hash = hashlib.sha1() + sha1_hash.update(feed_link.encode('utf-8')) + feed_id = sha1_hash.hexdigest() collection_dic = {"feed_id": feed_id, \ "type": 0, \ @@ -129,7 +132,9 @@ class FeedGetter(object): except: post_date = datetime(*article.published_parsed[:6]) - article_id = utils.uri_b64encode(article.link.encode('utf-8')) + sha1_hash = hashlib.sha1() + sha1_hash.update(article.link.encode('utf-8')) + article_id = sha1_hash.hexdigest() article = {"article_id": article_id, \ "type":1, \ diff --git a/source/sqlite2mongo.py b/source/sqlite2mongo.py index c4bb4e17..ecb0ec7f 100644 --- a/source/sqlite2mongo.py +++ b/source/sqlite2mongo.py @@ -1,6 +1,7 @@ #! /usr/bin/env python # -*- coding: utf-8 -*- +import hashlib import sqlite3 import mongodb @@ -34,7 +35,9 @@ def sqlite2mongo(): feed[2] + "'").fetchall() except: continue - feed_id = utils.uri_b64encode(feed[2].encode('utf-8')) + sha1_hash = hashlib.sha1() + sha1_hash.update(feed[2].encode('utf-8')) + feed_id = sha1_hash.hexdigest() new_collection = {"feed_id" : feed_id.encode('utf-8'), \ "type": 0, \ @@ -50,7 +53,9 @@ def sqlite2mongo(): # Walk through the list of articles for the current feed. articles = [] for article in list_of_articles: - article_id = utils.uri_b64encode(article[2].encode('utf-8')) + sha1_hash = hashlib.sha1() + sha1_hash.update(article[2].encode('utf-8')) + article_id = sha1_hash.hexdigest() article = {"article_id": article_id.encode('utf-8'), \ "type":1, \ diff --git a/source/utils.py b/source/utils.py index d1d2c684..da68550b 100755 --- a/source/utils.py +++ b/source/utils.py @@ -64,20 +64,6 @@ url_finders = [ \ re.compile("'\\<((mailto:)|)[-A-Za-z0-9\\.]+@[-A-Za-z0-9\\.]+"), \ ] -from base64 import urlsafe_b64encode, urlsafe_b64decode - -def uri_b64encode(s): - """ - Encode an URI in base 64 and remove the final '='. - """ - return urlsafe_b64encode(s).strip('=') - -def uri_b64decode(s): - """ - Decode a base 64 encoded URI. - """ - return urlsafe_b64decode(s + '=' * (4 - len(s) % 4)) - def detect_url_errors(list_of_urls): """ Detect URL errors. -- cgit