From 3744c4836bf52149823c05f455e3190bf1af583c Mon Sep 17 00:00:00 2001
From: cedricbonhomme
Date: Mon, 19 Mar 2012 23:07:37 +0100
Subject: Added script to convert the SQLite database to a MongoDB database for the new version of pyAggr3g470r.

---
Review changes to the original patch:
 * sqlite2mongo.py: the articles of a feed are selected with a bound
   parameter instead of a query built by string concatenation; SQLite
   errors are reported on stderr instead of being silently ignored; the
   connection is always closed (when the base could not be opened the
   cursor was undefined and the final c.close() raised a NameError).
 * mongodb.py: the options of create_index() are passed as keyword
   arguments instead of a positional dictionary.
 * The blob ids of the "index" lines are those of the original patch.

 mongodb.py      |  22 ++++++----------
 sqlite2mongo.py |  94 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 102 insertions(+), 14 deletions(-)
 create mode 100644 sqlite2mongo.py

diff --git a/mongodb.py b/mongodb.py
index 0eb4e002..a37b80c0 100644
--- a/mongodb.py
+++ b/mongodb.py
@@ -27,9 +27,8 @@ class Articles(object):
         """
         Creates a new collection for a new feed.
         """
-        #pymongo.collection.Collection(self.db, new_collection["feed_id"])
         collection = self.db[new_collection["feed_id"]]
-        collection.create_index([("article_link", pymongo.ASCENDING)], {"unique":True, "sparse":True})
+        #collection.create_index([("feed_link", pymongo.ASCENDING)], {"unique":True, "sparse":True})
         collection.insert(new_collection)
 
     def add_articles(self, articles, feed_id):
@@ -37,6 +36,10 @@ class Articles(object):
         Add article(s) in a collection.
         """
         collection = self.db[str(feed_id)]
+
+        collection.create_index([("article_link", pymongo.ASCENDING), ("article_date", pymongo.DESCENDING)], \
+                                        unique=False, sparse=False)
+
         for article in articles:
             cursor = collection.find({"article_id":article["article_id"]})
             if cursor.count() == 0:
@@ -102,7 +105,8 @@ class Articles(object):
             cursor = collection.find({"type":1})
         else:
             cursor = collection.find({"type":1, condition[0]:condition[1]})
-        return cursor.sort([("article_date", pymongo.DESCENDING)])
+        #return cursor.sort([("article_date", pymongo.DESCENDING)])
+        return cursor
 
     def print_articles_from_collection(self, collection_id):
         """
@@ -245,16 +249,6 @@ if __name__ == "__main__":
 
     #print articles.get_all_articles()
 
-
-
-    for feed in articles.get_all_collections():
-        for article in articles.get_articles_from_collection(feed["feed_id"]):
-            try:
-                #print article["article_title"], article["article_date"]
-                pass
-            except:
-                pass
-
 
     # Drop the database
-    #articles.drop_database()
\ No newline at end of file
+    articles.drop_database()
\ No newline at end of file
diff --git a/sqlite2mongo.py b/sqlite2mongo.py
new file mode 100644
index 00000000..b129ebce
--- /dev/null
+++ b/sqlite2mongo.py
@@ -0,0 +1,94 @@
+#! /usr/bin/env python
+# -*- coding: utf-8 -*-
+
+"""
+Convert the SQLite database of pyAggr3g470r to a MongoDB database.
+"""
+
+import hashlib
+import sqlite3
+import sys
+
+import mongodb
+
+SQLITE_BASE = "./var/feed.db"
+
+
+def _sha1(text):
+    """
+    Return the hexadecimal SHA-1 digest of the UTF-8 encoding of 'text'.
+    """
+    return hashlib.sha1(text.encode('utf-8')).hexdigest()
+
+
+def load_feed():
+    """
+    Copy the feeds and the articles of the SQLite base to MongoDB.
+
+    A collection is added for each feed; its identifier is the SHA-1
+    digest of the link of the feed. The articles of the feed are then
+    added to this collection. A feed whose articles can not be read is
+    skipped.
+    An error is reported on stderr when the SQLite base can not be read.
+    """
+    mongo = mongodb.Articles()
+
+    try:
+        conn = sqlite3.connect(SQLITE_BASE, isolation_level = None)
+    except sqlite3.Error as e:
+        sys.stderr.write("Unable to open %s: %s\n" % (SQLITE_BASE, e))
+        return
+
+    try:
+        c = conn.cursor()
+        try:
+            list_of_feeds = c.execute("SELECT * FROM feeds").fetchall()
+        except sqlite3.Error as e:
+            sys.stderr.write("Unable to load the feeds: %s\n" % (e,))
+            return
+
+        # Walk through the list of feeds
+        for feed in list_of_feeds:
+            try:
+                # The link is bound as a parameter: a quote in the URL
+                # must neither break nor alter the query.
+                list_of_articles = c.execute(
+                    "SELECT * FROM articles WHERE feed_link=?",
+                    (feed[2],)).fetchall()
+            except sqlite3.Error as e:
+                sys.stderr.write("Feed %r skipped: %s\n" % (feed[2], e))
+                continue
+
+            feed_id = _sha1(feed[2])
+            new_collection = {"feed_id" : feed_id.encode('utf-8'),
+                                "type": 0,
+                                "feed_image" : feed[3].encode('utf-8'),
+                                "feed_title" : feed[0].encode('utf-8'),
+                                "feed_link" : feed[2].encode('utf-8'),
+                                "site_link" : feed[1].encode('utf-8'),
+                                "mail" : feed[4]=="1"}
+            mongo.add_collection(new_collection)
+
+            if not list_of_articles:
+                continue
+
+            # Walk through the list of articles for the current feed.
+            articles = []
+            for row in list_of_articles:
+                articles.append({
+                    "article_id": _sha1(row[2]).encode('utf-8'),
+                    "type": 1,
+                    "article_date": row[0].encode('utf-8'),
+                    "article_link": row[2].encode('utf-8'),
+                    "article_title": row[1].encode('utf-8'),
+                    "article_content": row[3].encode('utf-8'),
+                    "article_readed": row[4]=="1",
+                    "article_like": row[6]=="1"
+                })
+            mongo.add_articles(articles, feed_id)
+    finally:
+        conn.close()
+
+
+if __name__ == "__main__":
+    load_feed()
\ No newline at end of file
-- 
cgit