Browse Source

Clarifications in queries_2 in tests

kora 1 year ago
parent
commit
e47ee8d48c
1 changed files with 100 additions and 102 deletions
  1. 100 102
      test_suite/tests_kora_misc/Query_speed/queries_2.py

+ 100 - 102
test_suite/tests_kora_misc/Query_speed/queries_2.py

@@ -1,10 +1,9 @@
 # %%
-import json
 import sqlite3
 import pandas as pd
 import time
 
-from decoding.decoding import getVettSpec, db_results_decode_pandas, db_results_decode, db_results_decode_nodict, db_encode, isColumnToDecode
+from decoding.decoding import getVettSpec, db_results_decode, db_encode, isColumnToDecode
 
 # Rifattorizzazione della routine 'findtext', per recuperare i contesti dal DB di Gatto4.
 # Per semplificare, la ricerca principale è limitata alla prima table di occorrenze -- Occ00001
@@ -73,6 +72,11 @@ def refactor1(LIST1, LIST2):
     return results
 
 # %%
+###########
+# PROVA 1
+###########
+
+# Cooccorrenze: ricerca di 'c*'
 str0 = 'c%'
 strEnc0 = db_encode(vettSpec, str0)
 
@@ -98,7 +102,7 @@ timestamp2 = time.time()
 
 print(timestamp2-timestamp0)
 # %%
-# Prova una cooccorrenza
+# Ricerca della seconda parola
 str1 = 'prima'
 strEnc1 = db_encode(vettSpec, str1)
 
@@ -155,80 +159,15 @@ with sqlite3.connect(f"file:{dbFile}?mode=ro", uri=True) as connection:
 
 timestamp1 = time.time()
 print(timestamp1 - timestamp0)
-# %%
-# Better decoding?
-print('The first one:', chr(int(vettSpec[0]['unicode'], 16)) )
-print('... is a newline')
-
-for index, entry in enumerate(vettSpec):
-    try:
-        print(index+1, chr(int(entry['intcode'])), chr(int(entry['unicode'], 16)))
-    except Exception as err:
-        print(index+1, err)
-        if(index>0):
-            break
 
-print('Total:', len(vettSpec))
 # %%
-timestamp0 = time.time()
-vettDictDec = {}
-vettDictEnc = {}
-for index, entry in enumerate(vettSpec):
-    try:
-        vettDictDec[chr(int(entry['intcode']))] = chr(int(entry['unicode'], 16))
-        vettDictEnc[chr(int(entry['unicode'], 16))] = chr(int(entry['intcode']))
-    except Exception as err:
-        print(index+1, err)
-        if(index>0):
-            break
+###########
+# PROVA 2
+###########
 
-timestamp1 = time.time()
-print(timestamp1 - timestamp0)
-# %%
-timestamp0 = time.time()
-str0 = 'c%'
-strEnc0 = db_encode(vettSpec, str0)
+# The find bib search -- va runnata a seguito della Prova 1 perché usa la variabile 'res'
 
-queryString0 = firstQueryL(strEnc0)
-
-with sqlite3.connect(f"file:{dbFile}?mode=ro", uri=True) as connection:
-    connection.row_factory = dict_factory
-    queryReponse = connection.cursor().execute(queryString0)
-    lemsNoPandas0 = queryReponse.fetchall()
-db_results_decode(lemsNoPandas0, vettSpec)
-
-timestamp1 = time.time()
-print(timestamp1 - timestamp0)
-# %%
-def db_decodeB(string):
-    return ''.join([vettDictDec[char] for char in string])
-
-def db_results_decodeB(result):
-    keysToTest = [key for key in result[0].keys() if isColumnToDecode(key)]
-    for row in result:
-        for key in keysToTest:
-            row[key] = db_decodeB(row[key])
-    return result
-# %%
-timestamp0 = time.time()
-str0 = 'c%'
-strEnc0 = db_encode(vettSpec, str0)
-
-queryString0 = firstQueryL(strEnc0)
-
-with sqlite3.connect(f"file:{dbFile}?mode=ro", uri=True) as connection:
-    connection.row_factory = dict_factory
-    queryReponse = connection.cursor().execute(queryString0)
-    lemsNoPandas0 = queryReponse.fetchall()
-
-db_results_decodeB(lemsNoPandas0)
-
-timestamp1 = time.time()
-print(timestamp1 - timestamp0)
-# IT WORKZ!
-# %%
-# FINALLY:
-# The bib search
+# A. Ridefinisco la stringa di ricerca
 def reducedQueryString(queryData):
 
     type = queryData['queryType']
@@ -261,31 +200,10 @@ def reducedQueryString(queryData):
         return f"SELECT head AS Rif_organico, full AS Rif_completo FROM org WHERE (indice='{numorg}' AND ntx='{ntx}')"
     
     return ""
-
 # %%
-timestamp0 = time.time()
-
-infobib = pd.DataFrame()
-rif_org = pd.DataFrame()
-for ind, row in res.iterrows():
-    queryData = {'queryType': 'bib', 'row': row}
-    queryStringBib = reducedQueryString(queryData)
-    dbFileBib='bibliografia/BiblioTLIO.db'
-    with sqlite3.connect(f"file:{dbFileBib}?mode=ro", uri=True) as connection:
-        bib = pd.read_sql(queryStringBib, connection)
-    infobib = pd.concat([infobib, bib])
-
-    queryData = {'queryType': 'rif', 'row': row}
-    queryStringRif = reducedQueryString(queryData)
-    with sqlite3.connect(f"file:{dbFile}?mode=ro", uri=True) as connection:
-        rif = pd.read_sql(queryStringRif, connection)
-    rif_org = pd.concat([rif_org, rif])
-
 
+# B. La versione vecchia ha una velocità non soddisfacente -- non la ripeto qui. Ne provo direttamente una nuova!
 
-timestamp1 = time.time()
-print(timestamp1 - timestamp0)
-# %%
 timestamp0 = time.time()
 
 siglaList = list(res['sigla'])
@@ -301,6 +219,7 @@ bib = bib.set_index('Sigla')
 timestamp1 = time.time()
 
 
+# Provo anche due diverse versioni di riassegnazione-Pandas dei risultati!
 annoiniz = [bib.loc[sigla, 'Anno iniziale'] for sigla in siglaList]
 annofin = [bib.loc[sigla, 'Anno finale'] for sigla in siglaList]
 datacod = [bib.loc[sigla, 'Data codificata'] for sigla in siglaList]
@@ -317,14 +236,11 @@ tipo = [bib.loc[sigla, 'Tipo'] for sigla in siglaList]
 iq = [bib.loc[sigla, 'IQ'] for sigla in siglaList]
 
 timestamp2 = time.time()
-print(timestamp1 - timestamp0)
-print(timestamp2 - timestamp0)
-# %%
-timestamp0 = time.time()
 
-aglia = {sigla: bib.loc[sigla].to_dict() for sigla in siglaSet}
 
-out = [aglia[sigla] for sigla in siglaList]
+part = {sigla: bib.loc[sigla].to_dict() for sigla in siglaSet}
+
+out = [part[sigla] for sigla in siglaList]
 
 annoiniz = [el['Anno iniziale'] for el in out]
 annofin = [el['Anno finale'] for el in out]
@@ -342,8 +258,90 @@ tipo = [el['Tipo'] for el in out]
 iq = [el['IQ'] for el in out]
 
 
+timestamp3 = time.time()
+print(timestamp1 - timestamp0)
+print(timestamp2 - timestamp0)
+print(timestamp3 - timestamp0)
+# %%
+###########
+# PROVA 3
+###########
+
+# Better decoding! -- Questa è indipendente dalle Prove 1 e 2 -- ha bisogno solo del run della prima cella.
+
+# A. Un warm-up: un'occhiata ai caratteri in vettSpec
+print('The first one:', chr(int(vettSpec[0]['unicode'], 16)) )
+print('... is a newline')
+
+for index, entry in enumerate(vettSpec):
+    try:
+        print(index+1, chr(int(entry['intcode'])), chr(int(entry['unicode'], 16)))
+    except Exception as err:
+        print(index+1, err)
+        if(index>0):
+            break
+
+print('Total:', len(vettSpec))
+# %%
+# B. Ridefinisco l'encoding/decoding usando 2 lookups dedicati
+timestamp0 = time.time()
+vettDictDec = {}
+vettDictEnc = {}
+for index, entry in enumerate(vettSpec):
+    try:
+        vettDictDec[chr(int(entry['intcode']))] = chr(int(entry['unicode'], 16))
+        vettDictEnc[chr(int(entry['unicode'], 16))] = chr(int(entry['intcode']))
+    except Exception as err:
+        print(index+1, err)
+        if(index>0):
+            break
+
+def db_decodeB(string):
+    return ''.join([vettDictDec[char] for char in string])
+
+def db_results_decodeB(result):
+    keysToTest = [key for key in result[0].keys() if isColumnToDecode(key)]
+    for row in result:
+        for key in keysToTest:
+            row[key] = db_decodeB(row[key])
+    return result
+
+
+timestamp1 = time.time()
+print(timestamp1 - timestamp0)
+# %%
+# Rilancio la ricerca di 'c*' paragonando i decoding
+timestamp0 = time.time()
+str0 = 'c%'
+strEnc0 = db_encode(vettSpec, str0)
+
+queryString0 = firstQueryL(strEnc0)
+
+with sqlite3.connect(f"file:{dbFile}?mode=ro", uri=True) as connection:
+    connection.row_factory = dict_factory
+    queryReponse = connection.cursor().execute(queryString0)
+    lemsNoPandas0 = queryReponse.fetchall()
+db_results_decode(lemsNoPandas0, vettSpec)
+
 timestamp1 = time.time()
 print(timestamp1 - timestamp0)
 # %%
-[el['Anno iniziale'] for el in out]
+timestamp0 = time.time()
+str0 = 'c%'
+strEnc0 = db_encode(vettSpec, str0)
+
+queryString0 = firstQueryL(strEnc0)
+
+with sqlite3.connect(f"file:{dbFile}?mode=ro", uri=True) as connection:
+    connection.row_factory = dict_factory
+    queryReponse = connection.cursor().execute(queryString0)
+    lemsNoPandas0 = queryReponse.fetchall()
+
+db_results_decodeB(lemsNoPandas0)
+
+timestamp1 = time.time()
+print(timestamp1 - timestamp0)
+# IT WORKZ!
+
+print(lemsNoPandas0)
 # %%