#! /usr/local/bin/python
#-*- coding: utf-8 -*-

__author__ = "Cedric Bonhomme"
__version__ = "$Revision: 0.2 $"
__date__ = "$Date: 2010/03/07 $"
__copyright__ = "Copyright (c) 2010 Cedric Bonhomme"
__license__ = "GPLv3"

IMPORT_ERROR = []

import re
try:
    import pylab
except:
    IMPORT_ERROR.append("pylab")
import sqlite3
import hashlib

from datetime import datetime
from string import punctuation
from collections import defaultdict

from StringIO import StringIO

try:
    from oice.langdet import langdet
    from oice.langdet import streams
    from oice.langdet import languages
except:
    IMPORT_ERROR.append("oice")

def detect_language(text):
    """
    Detect the language of a text.
    English, French or other (not detected).
    """
    text = text.strip()
    try:
        text_stream = streams.Stream(StringIO(text))
        lang = langdet.LanguageDetector.detect(text_stream)
    except:
        return 'other'
    if lang == languages.french:
        return 'french'.encode('utf-8')
    elif lang == languages.english:
        return 'english'.encode('utf-8')
    else:
        return 'other'

def remove_html_tags(data):
    """
    Remove HTML tags for the search.
    """
    p = re.compile(r'<[^<]*?/?>')
    return p.sub('', data)

def top_words(dic_articles, n=10):
    """
    Return the n most frequent words in a list.
    """
    words = {}
    articles_content = ""
    for rss_feed_id in dic_articles.keys():
        for article in dic_articles[rss_feed_id]:
            articles_content += remove_html_tags(article[4].encode('utf-8'))
    words_gen = (word.strip(punctuation).lower() \
                        for word in articles_content.split() \
                                if len(word) >= 5)
    words = defaultdict(int)
    for word in words_gen:
        words[word] += 1
    top_words = sorted(words.iteritems(),
                key=lambda(word, count): (-count, word))[:n]
    return top_words

def create_histogram(words, file_name="./var/histogram.png"):
    """
    Create a histogram.
    """
    length = 10
    ind = pylab.arange(length) # abscissa
    width = 0.35 # bars width

    w = [elem[0] for elem in words]
    count = [int(elem[1]) for elem in words]

    max_count = max(count)  # maximal weight

    p = pylab.bar(ind, count, width, color='r')

    pylab.ylabel("Count")
    pylab.title("Most frequent words")
    pylab.xticks(ind + (width / 2), range(1, len(w)+1))
    pylab.xlim(-width, len(ind))

    # changing the ordinate scale according to the max.
    if max_count <= 100:
        pylab.ylim(0, max_count + 5)
        pylab.yticks(pylab.arange(0, max_count + 5, 5))
    elif max_count <= 200:
        pylab.ylim(0, max_count + 10)
        pylab.yticks(pylab.arange(0, max_count + 10, 10))
    elif max_count <= 600:
        pylab.ylim(0, max_count + 25)
        pylab.yticks(pylab.arange(0, max_count + 25, 25))
    elif max_count <= 800:
        pylab.ylim(0, max_count + 50)
        pylab.yticks(pylab.arange(0, max_count + 50, 50))

    pylab.savefig(file_name, dpi = 80)
    pylab.close()

def compare(stringtime1, stringtime2):
    """
    Compare two dates in the format 'yyyy-mm-dd hh:mm:ss'.
    """
    date1, time1 = stringtime1.split(' ')
    date2, time2 = stringtime2.split(' ')

    year1, month1, day1 = date1.split('-')
    year2, month2, day2 = date2.split('-')

    hour1, minute1, second1 = time1.split(':')
    hour2, minute2, second2 = time2.split(':')

    datetime1 = datetime(year=int(year1), month=int(month1), day=int(day1), \
                        hour=int(hour1), minute=int(minute1), second=int(second1))

    datetime2 = datetime(year=int(year2), month=int(month2), day=int(day2), \
                        hour=int(hour2), minute=int(minute2), second=int(second2))

    if datetime1 < datetime2:
        return -1
    elif datetime1 > datetime2:
        return 1
    return 0

def load_feed():
    """
    Load feeds and articles in a dictionary.
    """
    list_of_feeds = None
    list_of_articles = None
    try:
        conn = sqlite3.connect("./var/feed.db", isolation_level = None)
        c = conn.cursor()
        list_of_feeds = c.execute("SELECT * FROM feeds").fetchall()
    except:
        pass

    # articles[feed_id] = (article_id, article_date, article_title,
    #               article_link, article_description, article_readed,
    #               article_language)
    # feeds[feed_id] = (nb_article, nb_article_unreaded, feed_image,
    #               feed_title, feed_link, feed_site_link)
    articles, feeds = {}, {}
    if list_of_feeds != []:
        for feed in list_of_feeds:
            list_of_articles = c.execute(\
                    "SELECT * FROM articles WHERE feed_link='" + \
                    feed[2] + "'").fetchall()

            if list_of_articles != []:
                for article in list_of_articles:
                    sha1_hash = hashlib.sha1()
                    sha1_hash.update(article[5].encode('utf-8'))
                    feed_id = sha1_hash.hexdigest()
                    sha1_hash.update(article[2].encode('utf-8'))
                    article_id = sha1_hash.hexdigest()

                    if "oice" not in IMPORT_ERROR:
                        if article[3] != "":
                            language = detect_language(remove_html_tags(article[3][:80]).encode('utf-8') + \
                                                remove_html_tags(article[1]).encode('utf-8'))
                        else:
                            language = detect_language(remove_html_tags(article[1]).encode('utf-8'))
                    else:
                        language = "IMPORT_ERROR"

                    article_list = [article_id, article[0], article[1], \
                        article[2], article[3], article[4], language]

                    if feed_id not in articles:
                        articles[feed_id] = [article_list]
                    else:
                        articles[feed_id].append(article_list)


                # sort articles by date for each feeds
                for rss_feed_id in articles.keys():
                    articles[rss_feed_id].sort(lambda x,y: compare(y[1], x[1]))

                feeds[feed_id] = (len(articles[feed_id]), \
                                len([article for article in articles[feed_id] \
                                    if article[5]=="0"]), \
                                feed[3], feed[0], feed[2], feed[1] \
                                )
        c.close()

        return (articles, feeds)
    return (articles, feeds)