123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187 |
- # %%
- import json
- import sqlite3
- import pandas as pd
- import time
- from decoding.decoding import getVettSpec, db_results_decode_pandas, db_results_decode, db_results_decode_nodict
- # %%
- #
- # Ricerca: copiata dalla ricerca per cooccorrenze, versione 18 Maggio 2023 Develop, con prima prima 'c*' come Lemma (con forme non lemmatizzate, il default) -- un sacco di risultati! -- seguita da un findtext automatico per TUTTI i contesti (il default per le cooccorrenze). Le parole successive non sono rilevanti per questa prova. Solo il procedimento fino alla prima query 'lunga' è considerato.
- # Il tutto è eseguito sul DB 'di prova grande' ndg2.gat4, poi MODIFICATO per aggiungere un indice su Periodi.
- # Il file del DB (copiato in locale)
- dbFile = 'corpus.db'
- ## First query (notare che il DB è codificato)
- firstQuery = "SELECT spec AS lemma, cat AS cat_gr, omo AS disambiguatore, nocc AS occ, cod FROM lem WHERE spec LIKE 'Г%' ORDER BY idlem"
- def secondQuery(strlist):
- return f"SELECT DISTINCT lemma as codLemma, forma as codForma FROM pfl WHERE lemma IN ({strlist})"
- # The BAD query
- def theQuery(LIST1, LIST2):
- return f'SELECT tab.cod, tab.ntx, tab.pitxt, tab.elemlen, tab.mappa, tab.numperiod, tab.links, tab.numorg, intbib.sigla, tab.vol, tab.pag, tab.riga, tab.col, tab.tipostanza, tab.stanza, tab.verso, tab.numbrano, lem.spec AS lemma, lem.cat AS cat_gr, lem.omo AS disambiguatore, prev_tab.pitxt AS piniz, next_tab.pitxt AS pfin, periodi.piniz AS backup_piniz, periodi.pfin AS backup_pfin FROM Occ00001 AS tab INNER JOIN intbib ON tab.ntx = intbib.ntx INNER JOIN lem ON tab.indlem = lem.cod LEFT JOIN Occ00001 AS prev_tab ON (tab.ntx = prev_tab.ntx AND tab.mappa = prev_tab.mappa+15) LEFT JOIN Occ00001 AS next_tab ON (tab.ntx = next_tab.ntx AND tab.mappa = next_tab.mappa-15) LEFT JOIN periodi ON (tab.ntx = periodi.ntx AND tab.numperiod = periodi.numperiod) WHERE tab.indlem IN ({LIST1}) OR (tab.indlem = 0 AND tab.cod IN ({LIST2}))'
- # A less BAD query
- def theQuerySimp(LIST1):
- return f'SELECT tab.cod, tab.ntx, tab.pitxt, tab.elemlen, tab.mappa, tab.numperiod, tab.links, tab.numorg, intbib.sigla, tab.vol, tab.pag, tab.riga, tab.col, tab.tipostanza, tab.stanza, tab.verso, tab.numbrano, lem.spec AS lemma, lem.cat AS cat_gr, lem.omo AS disambiguatore, prev_tab.pitxt AS piniz, next_tab.pitxt AS pfin, periodi.piniz AS backup_piniz, periodi.pfin AS backup_pfin FROM Occ00001 AS tab INNER JOIN intbib ON tab.ntx = intbib.ntx INNER JOIN lem ON tab.indlem = lem.cod LEFT JOIN Occ00001 AS prev_tab ON (tab.ntx = prev_tab.ntx AND tab.mappa = prev_tab.mappa+15) LEFT JOIN Occ00001 AS next_tab ON (tab.ntx = next_tab.ntx AND tab.mappa = next_tab.mappa-15) LEFT JOIN periodi ON (tab.ntx = periodi.ntx AND tab.numperiod = periodi.numperiod) WHERE tab.indlem IN ({LIST1})'
- # LIST1, LIST2 sono forniti in degli array in due file json in questo folder
- # Dict factory for non-Pandas queries
- def dict_factory(cursor, row):
- fields = [column[0] for column in cursor.description]
- return {key: value for key, value in zip(fields, row)}
- # VettSpec for decoding
- vettSpec = getVettSpec('decoding/')
- # %%
- timestamp0 = time.time()
- with sqlite3.connect(f"file:{dbFile}?mode=ro", uri=True) as connection:
- results = pd.read_sql(firstQuery, connection)
- timestamp1 = time.time()
- db_results_decode_pandas(results, vettSpec)
- print(time.time() - timestamp0)
- print(timestamp1 - timestamp0)
- # %%
- timestamp0 = time.time()
- with sqlite3.connect(f"file:{dbFile}?mode=ro", uri=True) as connection:
- connection.row_factory = dict_factory
- queryReponse = connection.cursor().execute(firstQuery)
- resultsNoPandas = queryReponse.fetchall()
- timestamp1 = time.time()
- db_results_decode(resultsNoPandas, vettSpec)
- print(time.time() - timestamp0)
- print(timestamp1 - timestamp0)
- # %%
- timestamp0 = time.time()
- with sqlite3.connect(f"file:{dbFile}?mode=ro", uri=True) as connection:
- queryReponse = connection.cursor().execute(firstQuery)
- resultsRaw = queryReponse.fetchall()
- timestamp1 = time.time()
- resultsRawB = db_results_decode_nodict(resultsRaw, vettSpec)
- print(time.time() - timestamp0)
- print(timestamp1 - timestamp0)
- # %%
- codes = [res['cod'] for res in resultsNoPandas]
- codesStr = [str(code) for code in codes]
- # %%
- timestamp0 = time.time()
- with sqlite3.connect(f"file:{dbFile}?mode=ro", uri=True) as connection:
- results2 = pd.read_sql(secondQuery(','.join(codesStr)), connection)
- print(time.time() - timestamp0)
- # %%
- formCodes = list(results2['codForma'])
- formCodesStr = [str(code) for code in formCodes]
- # %%
- # Cross-check
- with open('query_list_1.json', 'r') as file1:
- codesFromFile = json.load(file1)
- with open('query_list_2.json', 'r') as file1:
- formCodesFromFile = json.load(file1)
- print(codes==codesFromFile)
- print(formCodes==formCodesFromFile)
- # %%
- # LUNGA query
- timestamp0 = time.time()
- with sqlite3.connect(f"file:{dbFile}?mode=ro", uri=True) as connection:
- tmpQuery = theQuery(','.join(codesStr), ','.join(formCodesStr))
- results2 = pd.read_sql(tmpQuery, connection)
- db_results_decode_pandas(results2, vettSpec)
- print(time.time() - timestamp0)
- # %%
- # Dump dei risultati
- #with open('DeResult.json', 'w') as file1:
- # results2Dict = json.loads(results2.to_json(orient='records'))
- # json.dump(results2Dict, file1, indent=2)
- # %%
- # Sempre lunghina
- timestamp0 = time.time()
- with sqlite3.connect(f"file:{dbFile}?mode=ro", uri=True) as connection:
- tmpQuery = theQuerySimp(','.join(codesStr))
- results2Simp = pd.read_sql(tmpQuery, connection)
- print(time.time() - timestamp0)
- # %%
- # VEDERE DI SEMPLIFICARE...
- # %%
- # A still less BAD query
- def theQuerySimp2(LIST1, LIST2):
- return f'SELECT tab.cod, tab.ntx, tab.pitxt, tab.elemlen, tab.mappa, tab.numperiod, tab.links, tab.numorg, intbib.sigla, tab.vol, tab.pag, tab.riga, tab.col, tab.tipostanza, tab.stanza, tab.verso, tab.numbrano, lem.spec AS lemma, lem.cat AS cat_gr, lem.omo AS disambiguatore FROM Occ00001 AS tab INNER JOIN intbib ON tab.ntx = intbib.ntx INNER JOIN lem ON tab.indlem = lem.cod WHERE tab.indlem IN ({LIST1}) OR (tab.indlem = 0 AND tab.cod IN ({LIST2}))'
- '''
- NOTA:
- prev_tab.pitxt AS piniz, next_tab.pitxt AS pfin, periodi.piniz AS backup_piniz, periodi.pfin AS backup_pfin
- LEFT JOIN Occ00001 AS prev_tab ON (tab.ntx = prev_tab.ntx AND tab.mappa = prev_tab.mappa+15)
- LEFT JOIN Occ00001 AS next_tab ON (tab.ntx = next_tab.ntx AND tab.mappa = next_tab.mappa-15)
- LEFT JOIN periodi ON (tab.ntx = periodi.ntx AND tab.numperiod = periodi.numperiod)
- '''
- # %%
- # %%
- timestamp0 = time.time()
- with sqlite3.connect(f"file:{dbFile}?mode=ro", uri=True) as connection:
- tmpQuery = theQuerySimp2(','.join(codesStr), ','.join(formCodesStr))
- results2Simp2_a = pd.read_sql(tmpQuery, connection)
- print(time.time() - timestamp0)
- # %%
- timestamp0 = time.time()
- with sqlite3.connect(f"file:{dbFile}?mode=ro", uri=True) as connection:
- tmpQuery = theQuerySimp2(','.join(codesStr), ','.join(formCodesStr))
- results2Simp2_b = pd.read_sql(tmpQuery, connection)
- piniz = []
- pfin = []
- backup_piniz = []
- backup_pfin = []
- cur = connection.cursor()
- for index, row in results2Simp2_b.iterrows():
- ntx = row['ntx']
- prevMappa = row['mappa'] - 15
- cur.execute(f'SELECT prev_tab.pitxt AS piniz FROM Occ00001 AS prev_tab WHERE prev_tab.ntx = {ntx} AND prev_tab.mappa = {prevMappa}')
- res = cur.fetchone()
- piniz.append(res[0] if res is not None else None)
- print(time.time() - timestamp0)
- # %%
- ############################
- # Test with temporary table!
- ############################
- timestamp0 = time.time()
- with sqlite3.connect(f"file:{dbFile}?mode=ro", uri=True) as connection:
- tmpQuery = theQuerySimp2(','.join(codesStr), ','.join(formCodesStr))
- querr = 'CREATE TEMPORARY TABLE stuff AS ' + tmpQuery
- connection.cursor().execute(querr)
- riQuery = f'SELECT stuff.ntx, stuff.mappa, tab.pitxt AS piniz FROM stuff LEFT JOIN Occ00001 AS tab ON tab.ntx=stuff.ntx AND tab.mappa=stuff.mappa-15'
- bisQuery = f'SELECT stuff.ntx, stuff.mappa, tab.pitxt AS pfin FROM stuff LEFT JOIN Occ00001 AS tab ON tab.ntx=stuff.ntx AND tab.mappa=stuff.mappa+15'
- trisQuery = f'SELECT * from stuff'
- quadrisQuery = f'SELECT stuff.ntx, stuff.numperiod, periodi.piniz AS backup_piniz, periodi.pfin AS backup_pfin FROM stuff, periodi WHERE stuff.ntx = periodi.ntx AND stuff.numperiod = periodi.numperiod'
- results2Simp2_c1 = pd.read_sql(riQuery, connection)
- results2Simp2_c2 = pd.read_sql(bisQuery, connection)
- results2Simp2_c3 = pd.read_sql(trisQuery, connection)
- results2Simp2_c4 = pd.read_sql(quadrisQuery, connection)
- results2Simp2_c3['piniz'] = results2Simp2_c1['piniz']
- results2Simp2_c3['pfin'] = results2Simp2_c2['pfin']
- results2Simp2_c3[['backup_piniz', 'backup_pfin']] = results2Simp2_c4[['backup_piniz', 'backup_pfin']]
- timestamp1 = time.time()
- db_results_decode_pandas(results2Simp2_c3, vettSpec)
- print(timestamp1 - timestamp0)
- print(time.time() - timestamp0)
- # %%
- #with open('DeResult_chk.json', 'w') as file1:
- # results2Simp2_c3_Dict = json.loads(results2Simp2_c3.to_json(orient='records'))
- # json.dump(results2Simp2_c3_Dict, file1, indent=2)
- # %%
|