# %% import json import sqlite3 import pandas as pd import time from decoding.decoding import getVettSpec, db_results_decode_pandas, db_results_decode, db_results_decode_nodict # %% # # Ricerca: copiata dalla ricerca per cooccorrenze, versione 18 Maggio 2023 Develop, con prima prima 'c*' come Lemma (con forme non lemmatizzate, il default) -- un sacco di risultati! -- seguita da un findtext automatico per TUTTI i contesti (il default per le cooccorrenze). Le parole successive non sono rilevanti per questa prova. Solo il procedimento fino alla prima query 'lunga' è considerato. # Il tutto è eseguito sul DB 'di prova grande' ndg2.gat4, poi MODIFICATO per aggiungere un indice su Periodi. # Il file del DB (copiato in locale) dbFile = 'corpus.db' ## First query (notare che il DB è codificato) firstQuery = "SELECT spec AS lemma, cat AS cat_gr, omo AS disambiguatore, nocc AS occ, cod FROM lem WHERE spec LIKE 'Г%' ORDER BY idlem" def secondQuery(strlist): return f"SELECT DISTINCT lemma as codLemma, forma as codForma FROM pfl WHERE lemma IN ({strlist})" # The BAD query def theQuery(LIST1, LIST2): return f'SELECT tab.cod, tab.ntx, tab.pitxt, tab.elemlen, tab.mappa, tab.numperiod, tab.links, tab.numorg, intbib.sigla, tab.vol, tab.pag, tab.riga, tab.col, tab.tipostanza, tab.stanza, tab.verso, tab.numbrano, lem.spec AS lemma, lem.cat AS cat_gr, lem.omo AS disambiguatore, prev_tab.pitxt AS piniz, next_tab.pitxt AS pfin, periodi.piniz AS backup_piniz, periodi.pfin AS backup_pfin FROM Occ00001 AS tab INNER JOIN intbib ON tab.ntx = intbib.ntx INNER JOIN lem ON tab.indlem = lem.cod LEFT JOIN Occ00001 AS prev_tab ON (tab.ntx = prev_tab.ntx AND tab.mappa = prev_tab.mappa+15) LEFT JOIN Occ00001 AS next_tab ON (tab.ntx = next_tab.ntx AND tab.mappa = next_tab.mappa-15) LEFT JOIN periodi ON (tab.ntx = periodi.ntx AND tab.numperiod = periodi.numperiod) WHERE tab.indlem IN ({LIST1}) OR (tab.indlem = 0 AND tab.cod IN ({LIST2}))' # A less BAD query def theQuerySimp(LIST1): return f'SELECT tab.cod, tab.ntx, tab.pitxt, tab.elemlen, tab.mappa, tab.numperiod, tab.links, tab.numorg, intbib.sigla, tab.vol, tab.pag, tab.riga, tab.col, tab.tipostanza, tab.stanza, tab.verso, tab.numbrano, lem.spec AS lemma, lem.cat AS cat_gr, lem.omo AS disambiguatore, prev_tab.pitxt AS piniz, next_tab.pitxt AS pfin, periodi.piniz AS backup_piniz, periodi.pfin AS backup_pfin FROM Occ00001 AS tab INNER JOIN intbib ON tab.ntx = intbib.ntx INNER JOIN lem ON tab.indlem = lem.cod LEFT JOIN Occ00001 AS prev_tab ON (tab.ntx = prev_tab.ntx AND tab.mappa = prev_tab.mappa+15) LEFT JOIN Occ00001 AS next_tab ON (tab.ntx = next_tab.ntx AND tab.mappa = next_tab.mappa-15) LEFT JOIN periodi ON (tab.ntx = periodi.ntx AND tab.numperiod = periodi.numperiod) WHERE tab.indlem IN ({LIST1})' # LIST1, LIST2 sono forniti in degli array in due file json in questo folder # Dict factory for non-Pandas queries def dict_factory(cursor, row): fields = [column[0] for column in cursor.description] return {key: value for key, value in zip(fields, row)} # VettSpec for decoding vettSpec = getVettSpec('decoding/') # %% timestamp0 = time.time() with sqlite3.connect(f"file:{dbFile}?mode=ro", uri=True) as connection: results = pd.read_sql(firstQuery, connection) timestamp1 = time.time() db_results_decode_pandas(results, vettSpec) print(time.time() - timestamp0) print(timestamp1 - timestamp0) # %% timestamp0 = time.time() with sqlite3.connect(f"file:{dbFile}?mode=ro", uri=True) as connection: connection.row_factory = dict_factory queryReponse = connection.cursor().execute(firstQuery) resultsNoPandas = queryReponse.fetchall() timestamp1 = time.time() db_results_decode(resultsNoPandas, vettSpec) print(time.time() - timestamp0) print(timestamp1 - timestamp0) # %% timestamp0 = time.time() with sqlite3.connect(f"file:{dbFile}?mode=ro", uri=True) as connection: queryReponse = connection.cursor().execute(firstQuery) resultsRaw = queryReponse.fetchall() timestamp1 = time.time() resultsRawB = db_results_decode_nodict(resultsRaw, vettSpec) print(time.time() - timestamp0) print(timestamp1 - timestamp0) # %% codes = [res['cod'] for res in resultsNoPandas] codesStr = [str(code) for code in codes] # %% timestamp0 = time.time() with sqlite3.connect(f"file:{dbFile}?mode=ro", uri=True) as connection: results2 = pd.read_sql(secondQuery(','.join(codesStr)), connection) print(time.time() - timestamp0) # %% formCodes = list(results2['codForma']) formCodesStr = [str(code) for code in formCodes] # %% # Cross-check with open('query_list_1.json', 'r') as file1: codesFromFile = json.load(file1) with open('query_list_2.json', 'r') as file1: formCodesFromFile = json.load(file1) print(codes==codesFromFile) print(formCodes==formCodesFromFile) # %% # LUNGA query timestamp0 = time.time() with sqlite3.connect(f"file:{dbFile}?mode=ro", uri=True) as connection: tmpQuery = theQuery(','.join(codesStr), ','.join(formCodesStr)) results2 = pd.read_sql(tmpQuery, connection) db_results_decode_pandas(results2, vettSpec) print(time.time() - timestamp0) # %% # Dump dei risultati #with open('DeResult.json', 'w') as file1: # results2Dict = json.loads(results2.to_json(orient='records')) # json.dump(results2Dict, file1, indent=2) # %% # Sempre lunghina timestamp0 = time.time() with sqlite3.connect(f"file:{dbFile}?mode=ro", uri=True) as connection: tmpQuery = theQuerySimp(','.join(codesStr)) results2Simp = pd.read_sql(tmpQuery, connection) print(time.time() - timestamp0) # %% # VEDERE DI SEMPLIFICARE... # %% # A still less BAD query def theQuerySimp2(LIST1, LIST2): return f'SELECT tab.cod, tab.ntx, tab.pitxt, tab.elemlen, tab.mappa, tab.numperiod, tab.links, tab.numorg, intbib.sigla, tab.vol, tab.pag, tab.riga, tab.col, tab.tipostanza, tab.stanza, tab.verso, tab.numbrano, lem.spec AS lemma, lem.cat AS cat_gr, lem.omo AS disambiguatore FROM Occ00001 AS tab INNER JOIN intbib ON tab.ntx = intbib.ntx INNER JOIN lem ON tab.indlem = lem.cod WHERE tab.indlem IN ({LIST1}) OR (tab.indlem = 0 AND tab.cod IN ({LIST2}))' ''' NOTA: prev_tab.pitxt AS piniz, next_tab.pitxt AS pfin, periodi.piniz AS backup_piniz, periodi.pfin AS backup_pfin LEFT JOIN Occ00001 AS prev_tab ON (tab.ntx = prev_tab.ntx AND tab.mappa = prev_tab.mappa+15) LEFT JOIN Occ00001 AS next_tab ON (tab.ntx = next_tab.ntx AND tab.mappa = next_tab.mappa-15) LEFT JOIN periodi ON (tab.ntx = periodi.ntx AND tab.numperiod = periodi.numperiod) ''' # %% # %% timestamp0 = time.time() with sqlite3.connect(f"file:{dbFile}?mode=ro", uri=True) as connection: tmpQuery = theQuerySimp2(','.join(codesStr), ','.join(formCodesStr)) results2Simp2_a = pd.read_sql(tmpQuery, connection) print(time.time() - timestamp0) # %% timestamp0 = time.time() with sqlite3.connect(f"file:{dbFile}?mode=ro", uri=True) as connection: tmpQuery = theQuerySimp2(','.join(codesStr), ','.join(formCodesStr)) results2Simp2_b = pd.read_sql(tmpQuery, connection) piniz = [] pfin = [] backup_piniz = [] backup_pfin = [] cur = connection.cursor() for index, row in results2Simp2_b.iterrows(): ntx = row['ntx'] prevMappa = row['mappa'] - 15 cur.execute(f'SELECT prev_tab.pitxt AS piniz FROM Occ00001 AS prev_tab WHERE prev_tab.ntx = {ntx} AND prev_tab.mappa = {prevMappa}') res = cur.fetchone() piniz.append(res[0] if res is not None else None) print(time.time() - timestamp0) # %% ############################ # Test with temporary table! ############################ timestamp0 = time.time() with sqlite3.connect(f"file:{dbFile}?mode=ro", uri=True) as connection: tmpQuery = theQuerySimp2(','.join(codesStr), ','.join(formCodesStr)) querr = 'CREATE TEMPORARY TABLE stuff AS ' + tmpQuery connection.cursor().execute(querr) riQuery = f'SELECT stuff.ntx, stuff.mappa, tab.pitxt AS piniz FROM stuff LEFT JOIN Occ00001 AS tab ON tab.ntx=stuff.ntx AND tab.mappa=stuff.mappa-15' bisQuery = f'SELECT stuff.ntx, stuff.mappa, tab.pitxt AS pfin FROM stuff LEFT JOIN Occ00001 AS tab ON tab.ntx=stuff.ntx AND tab.mappa=stuff.mappa+15' trisQuery = f'SELECT * from stuff' quadrisQuery = f'SELECT stuff.ntx, stuff.numperiod, periodi.piniz AS backup_piniz, periodi.pfin AS backup_pfin FROM stuff, periodi WHERE stuff.ntx = periodi.ntx AND stuff.numperiod = periodi.numperiod' results2Simp2_c1 = pd.read_sql(riQuery, connection) results2Simp2_c2 = pd.read_sql(bisQuery, connection) results2Simp2_c3 = pd.read_sql(trisQuery, connection) results2Simp2_c4 = pd.read_sql(quadrisQuery, connection) results2Simp2_c3['piniz'] = results2Simp2_c1['piniz'] results2Simp2_c3['pfin'] = results2Simp2_c2['pfin'] results2Simp2_c3[['backup_piniz', 'backup_pfin']] = results2Simp2_c4[['backup_piniz', 'backup_pfin']] timestamp1 = time.time() db_results_decode_pandas(results2Simp2_c3, vettSpec) print(timestamp1 - timestamp0) print(time.time() - timestamp0) # %% #with open('DeResult_chk.json', 'w') as file1: # results2Simp2_c3_Dict = json.loads(results2Simp2_c3.to_json(orient='records')) # json.dump(results2Simp2_c3_Dict, file1, indent=2) # %%