From a6bc5bf8d7d003b6cf4e623485b330e1e2830703 Mon Sep 17 00:00:00 2001
From: Cédric Bonhomme
Date: Mon, 3 Feb 2014 19:38:50 +0100
Subject: Added naive benchmarks for Whoosh and ElasticSearch.

---
 benchmark/__init__.py          |  1 +
 benchmark/testelasticsearch.py | 74 +++++++++++++++++++++++++++++++++++++++
 benchmark/testwhoosh.py        | 75 ++++++++++++++++++++++++++++++++++++++++
 runbenchmark.py                | 79 ++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 229 insertions(+)
 create mode 100644 benchmark/__init__.py
 create mode 100644 benchmark/testelasticsearch.py
 create mode 100644 benchmark/testwhoosh.py
 create mode 100644 runbenchmark.py

diff --git a/benchmark/__init__.py b/benchmark/__init__.py
new file mode 100644
index 00000000..8d1c8b69
--- /dev/null
+++ b/benchmark/__init__.py
@@ -0,0 +1 @@
+
diff --git a/benchmark/testelasticsearch.py b/benchmark/testelasticsearch.py
new file mode 100644
index 00000000..7a5d1d5b
--- /dev/null
+++ b/benchmark/testelasticsearch.py
@@ -0,0 +1,74 @@
+#! /usr/bin/env python
+#-*- coding: utf-8 -*-
+
+"""
+Naive benchmark of ElasticSearch: indexes the articles and searches them.
+"""
+
+import logging
+
+import elasticsearch
+from elasticsearch import client
+
+from pyaggr3g470r import utils
+
+logger = logging.getLogger(__name__)
+
+# Name of the index used by the benchmark.
+INDEX_NAME = "pyaggr3g470r"
+
+# Connect to the local Elasticsearch node.
+es = elasticsearch.Elasticsearch(hosts=[{"host": "127.0.0.1", "port": 9200}])
+
+def delete_index():
+    """
+    Deletes all indexes.
+
+    Best effort: a failure is logged and otherwise ignored.
+    """
+    ic = client.IndicesClient(es.indices.client)
+    try:
+        ic.delete("")
+    except Exception as e:
+        logger.warning("Unable to delete the indexes: %s", e)
+
+def create_index(articles):
+    """
+    Creates the index: one document (title and content) per article.
+    Returns True once every article has been sent to ElasticSearch.
+    """
+    for article in articles:
+        es.index(
+            index=INDEX_NAME,
+            doc_type="text",
+            id=str(article.id),
+            body={
+                "title": article.title,
+                "content": utils.clear_string(article.content)
+            }
+        )
+    return True
+
+def search(term):
+    """
+    Search a term in the "content" field of the indexed articles.
+    Returns the response of ElasticSearch (limited to 5000 hits), or None
+    when ElasticSearch raises a NotFoundError.
+    """
+    body = {
+        "query": {
+            "filtered": {
+                "query": {
+                    "query_string": {
+                        "default_field": "content",
+                        "query": term
+                    }
+                }
+            }
+        }
+    }
+    try:
+        return es.search(index=INDEX_NAME, body=body, size=5000)
+    except elasticsearch.exceptions.NotFoundError as e:
+        logger.warning(str(e))
+        return None
diff --git a/benchmark/testwhoosh.py b/benchmark/testwhoosh.py
new file mode 100644
index 00000000..b488dcd6
--- /dev/null
+++ b/benchmark/testwhoosh.py
@@ -0,0 +1,75 @@
+#! /usr/bin/env python
+#-*- coding: utf-8 -*-
+
+"""
+Naive benchmark of Whoosh: indexes the articles and searches them.
+"""
+
+import os
+
+from whoosh.index import create_in, open_dir
+from whoosh.index import EmptyIndexError
+from whoosh.fields import *
+from whoosh.query import *
+from whoosh.qparser import QueryParser
+
+from pyaggr3g470r import utils
+
+indexdir = "./pyaggr3g470r/var/indexdir"
+
+schema = Schema(title=TEXT(stored=True), content=TEXT)
+
+def _open_index():
+    """
+    Opens the index stored in `indexdir`.
+    Raises EmptyIndexError when the index can not be opened.
+    """
+    try:
+        return open_dir(indexdir)
+    except (EmptyIndexError, OSError):
+        raise EmptyIndexError
+
+def create_index(articles):
+    """
+    Creates the index: one document (title and content) per article.
+    """
+    # Make sure the directory exists before handing it to create_in().
+    if not os.path.exists(indexdir):
+        os.makedirs(indexdir)
+    ix = create_in(indexdir, schema)
+    writer = ix.writer()
+    for article in articles:
+        # The title is indexed too, as in the ElasticSearch benchmark, so
+        # that both engines handle the same data.
+        # NOTE(review): Whoosh only accepts unicode text, confirm that
+        # article.title is unicode.
+        writer.add_document(title=article.title,
+                            content=utils.clear_string(article.content))
+    writer.commit()
+
+def nb_documents():
+    """
+    Returns the number of documents of the index.
+    """
+    return _open_index().doc_count()
+
+def search(term):
+    """
+    Search for `term` in the index.
+    Returns a list with the stored fields of each matching article.
+    """
+    ix = _open_index()
+    with ix.searcher() as searcher:
+        query = QueryParser("content", ix.schema).parse(term)
+        results = searcher.search(query, limit=None)
+        # The hits can only be read while the searcher is open: copy the
+        # stored fields before leaving the "with" block.
+        return [hit.fields() for hit in results]
+
+
+if __name__ == "__main__":
+    # Point of entry in execution mode.
+    print(nb_documents())
+    results = search("Nothomb")
+    for article in results:
+        print(article)
diff --git a/runbenchmark.py b/runbenchmark.py
new file mode 100644
index 00000000..8241e52b
--- /dev/null
+++ b/runbenchmark.py
@@ -0,0 +1,79 @@
+#! /usr/bin/env python
+#-*- coding: utf-8 -*-
+
+"""
+Naive benchmark comparing the indexing and the search times of Whoosh and
+ElasticSearch on the articles of the database.
+"""
+
+import time
+from benchmark import testelasticsearch
+from benchmark import testwhoosh
+
+import conf
+from pyaggr3g470r import models
+models.connect(conf.DATABASE_NAME)
+
+# Queries submitted to both search engines.
+QUERIES = ["Edward Snowden", "Saint-Pierre-et-Miquelon", "micropatronage"]
+
+# Number of times each query is submitted to each search engine.
+NB_RUNS = 5
+
+def timed(function, *args):
+    """
+    Calls function(*args) and returns the elapsed time, in seconds.
+    """
+    begin = time.time()
+    function(*args)
+    return time.time() - begin
+
+# Load the articles before starting any timer.
+# NOTE(review): objects() presumably returns a lazy query set, in which case
+# the first engine measured would also pay for the database query.
+articles = list(models.Article.objects())
+
+
+#
+# Index generation
+#
+
+print "Indexes generation..."
+# Whoosh
+print "Whoosh"
+print timed(testwhoosh.create_index, articles)
+
+print
+
+# ElasticSearch
+print "ElasticSearch"
+testelasticsearch.delete_index()
+print timed(testelasticsearch.create_index, articles)
+
+print
+print
+print
+
+
+#
+# Search
+#
+
+print "Search..."
+for query in QUERIES:
+    print "Query:", query
+
+    # Whoosh
+    print "with Whoosh"
+    for _ in range(NB_RUNS):
+        print timed(testwhoosh.search, query)
+
+    print
+
+    # ElasticSearch
+    print "with ElasticSearch"
+    for _ in range(NB_RUNS):
+        print timed(testelasticsearch.search, query)
+
+    print
+    print
-- 
cgit