123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187 |
- import json
- import sqlite3
- import pandas as pd
- import time
- from decoding.decoding import getVettSpec, db_results_decode_pandas, db_results_decode, db_results_decode_nodict
- dbFile = 'corpus.db'
- firstQuery = "SELECT spec AS lemma, cat AS cat_gr, omo AS disambiguatore, nocc AS occ, cod FROM lem WHERE spec LIKE 'Г%' ORDER BY idlem"
- def secondQuery(strlist):
- return f"SELECT DISTINCT lemma as codLemma, forma as codForma FROM pfl WHERE lemma IN ({strlist})"
- def theQuery(LIST1, LIST2):
- return f'SELECT tab.cod, tab.ntx, tab.pitxt, tab.elemlen, tab.mappa, tab.numperiod, tab.links, tab.numorg, intbib.sigla, tab.vol, tab.pag, tab.riga, tab.col, tab.tipostanza, tab.stanza, tab.verso, tab.numbrano, lem.spec AS lemma, lem.cat AS cat_gr, lem.omo AS disambiguatore, prev_tab.pitxt AS piniz, next_tab.pitxt AS pfin, periodi.piniz AS backup_piniz, periodi.pfin AS backup_pfin FROM Occ00001 AS tab INNER JOIN intbib ON tab.ntx = intbib.ntx INNER JOIN lem ON tab.indlem = lem.cod LEFT JOIN Occ00001 AS prev_tab ON (tab.ntx = prev_tab.ntx AND tab.mappa = prev_tab.mappa+15) LEFT JOIN Occ00001 AS next_tab ON (tab.ntx = next_tab.ntx AND tab.mappa = next_tab.mappa-15) LEFT JOIN periodi ON (tab.ntx = periodi.ntx AND tab.numperiod = periodi.numperiod) WHERE tab.indlem IN ({LIST1}) OR (tab.indlem = 0 AND tab.cod IN ({LIST2}))'
- def theQuerySimp(LIST1):
- return f'SELECT tab.cod, tab.ntx, tab.pitxt, tab.elemlen, tab.mappa, tab.numperiod, tab.links, tab.numorg, intbib.sigla, tab.vol, tab.pag, tab.riga, tab.col, tab.tipostanza, tab.stanza, tab.verso, tab.numbrano, lem.spec AS lemma, lem.cat AS cat_gr, lem.omo AS disambiguatore, prev_tab.pitxt AS piniz, next_tab.pitxt AS pfin, periodi.piniz AS backup_piniz, periodi.pfin AS backup_pfin FROM Occ00001 AS tab INNER JOIN intbib ON tab.ntx = intbib.ntx INNER JOIN lem ON tab.indlem = lem.cod LEFT JOIN Occ00001 AS prev_tab ON (tab.ntx = prev_tab.ntx AND tab.mappa = prev_tab.mappa+15) LEFT JOIN Occ00001 AS next_tab ON (tab.ntx = next_tab.ntx AND tab.mappa = next_tab.mappa-15) LEFT JOIN periodi ON (tab.ntx = periodi.ntx AND tab.numperiod = periodi.numperiod) WHERE tab.indlem IN ({LIST1})'
- def dict_factory(cursor, row):
- fields = [column[0] for column in cursor.description]
- return {key: value for key, value in zip(fields, row)}
- vettSpec = getVettSpec('decoding/')
- timestamp0 = time.time()
- with sqlite3.connect(f"file:{dbFile}?mode=ro", uri=True) as connection:
- results = pd.read_sql(firstQuery, connection)
- timestamp1 = time.time()
- db_results_decode_pandas(results, vettSpec)
- print(time.time() - timestamp0)
- print(timestamp1 - timestamp0)
- timestamp0 = time.time()
- with sqlite3.connect(f"file:{dbFile}?mode=ro", uri=True) as connection:
- connection.row_factory = dict_factory
- queryReponse = connection.cursor().execute(firstQuery)
- resultsNoPandas = queryReponse.fetchall()
- timestamp1 = time.time()
- db_results_decode(resultsNoPandas, vettSpec)
- print(time.time() - timestamp0)
- print(timestamp1 - timestamp0)
- timestamp0 = time.time()
- with sqlite3.connect(f"file:{dbFile}?mode=ro", uri=True) as connection:
- queryReponse = connection.cursor().execute(firstQuery)
- resultsRaw = queryReponse.fetchall()
- timestamp1 = time.time()
- resultsRawB = db_results_decode_nodict(resultsRaw, vettSpec)
- print(time.time() - timestamp0)
- print(timestamp1 - timestamp0)
- codes = [res['cod'] for res in resultsNoPandas]
- codesStr = [str(code) for code in codes]
- timestamp0 = time.time()
- with sqlite3.connect(f"file:{dbFile}?mode=ro", uri=True) as connection:
- results2 = pd.read_sql(secondQuery(','.join(codesStr)), connection)
- print(time.time() - timestamp0)
- formCodes = list(results2['codForma'])
- formCodesStr = [str(code) for code in formCodes]
- with open('query_list_1.json', 'r') as file1:
- codesFromFile = json.load(file1)
- with open('query_list_2.json', 'r') as file1:
- formCodesFromFile = json.load(file1)
- print(codes==codesFromFile)
- print(formCodes==formCodesFromFile)
- timestamp0 = time.time()
- with sqlite3.connect(f"file:{dbFile}?mode=ro", uri=True) as connection:
- tmpQuery = theQuery(','.join(codesStr), ','.join(formCodesStr))
- results2 = pd.read_sql(tmpQuery, connection)
- db_results_decode_pandas(results2, vettSpec)
- print(time.time() - timestamp0)
- timestamp0 = time.time()
- with sqlite3.connect(f"file:{dbFile}?mode=ro", uri=True) as connection:
- tmpQuery = theQuerySimp(','.join(codesStr))
- results2Simp = pd.read_sql(tmpQuery, connection)
- print(time.time() - timestamp0)
- def theQuerySimp2(LIST1, LIST2):
- return f'SELECT tab.cod, tab.ntx, tab.pitxt, tab.elemlen, tab.mappa, tab.numperiod, tab.links, tab.numorg, intbib.sigla, tab.vol, tab.pag, tab.riga, tab.col, tab.tipostanza, tab.stanza, tab.verso, tab.numbrano, lem.spec AS lemma, lem.cat AS cat_gr, lem.omo AS disambiguatore FROM Occ00001 AS tab INNER JOIN intbib ON tab.ntx = intbib.ntx INNER JOIN lem ON tab.indlem = lem.cod WHERE tab.indlem IN ({LIST1}) OR (tab.indlem = 0 AND tab.cod IN ({LIST2}))'
- '''
- NOTA:
- prev_tab.pitxt AS piniz, next_tab.pitxt AS pfin, periodi.piniz AS backup_piniz, periodi.pfin AS backup_pfin
- LEFT JOIN Occ00001 AS prev_tab ON (tab.ntx = prev_tab.ntx AND tab.mappa = prev_tab.mappa+15)
- LEFT JOIN Occ00001 AS next_tab ON (tab.ntx = next_tab.ntx AND tab.mappa = next_tab.mappa-15)
- LEFT JOIN periodi ON (tab.ntx = periodi.ntx AND tab.numperiod = periodi.numperiod)
- '''
- timestamp0 = time.time()
- with sqlite3.connect(f"file:{dbFile}?mode=ro", uri=True) as connection:
- tmpQuery = theQuerySimp2(','.join(codesStr), ','.join(formCodesStr))
- results2Simp2_a = pd.read_sql(tmpQuery, connection)
- print(time.time() - timestamp0)
- timestamp0 = time.time()
- with sqlite3.connect(f"file:{dbFile}?mode=ro", uri=True) as connection:
- tmpQuery = theQuerySimp2(','.join(codesStr), ','.join(formCodesStr))
- results2Simp2_b = pd.read_sql(tmpQuery, connection)
- piniz = []
- pfin = []
- backup_piniz = []
- backup_pfin = []
- cur = connection.cursor()
- for index, row in results2Simp2_b.iterrows():
- ntx = row['ntx']
- prevMappa = row['mappa'] - 15
- cur.execute(f'SELECT prev_tab.pitxt AS piniz FROM Occ00001 AS prev_tab WHERE prev_tab.ntx = {ntx} AND prev_tab.mappa = {prevMappa}')
- res = cur.fetchone()
- piniz.append(res[0] if res is not None else None)
- print(time.time() - timestamp0)
- timestamp0 = time.time()
- with sqlite3.connect(f"file:{dbFile}?mode=ro", uri=True) as connection:
- tmpQuery = theQuerySimp2(','.join(codesStr), ','.join(formCodesStr))
- querr = 'CREATE TEMPORARY TABLE stuff AS ' + tmpQuery
- connection.cursor().execute(querr)
- riQuery = f'SELECT stuff.ntx, stuff.mappa, tab.pitxt AS piniz FROM stuff LEFT JOIN Occ00001 AS tab ON tab.ntx=stuff.ntx AND tab.mappa=stuff.mappa-15'
- bisQuery = f'SELECT stuff.ntx, stuff.mappa, tab.pitxt AS pfin FROM stuff LEFT JOIN Occ00001 AS tab ON tab.ntx=stuff.ntx AND tab.mappa=stuff.mappa+15'
- trisQuery = f'SELECT * from stuff'
- quadrisQuery = f'SELECT stuff.ntx, stuff.numperiod, periodi.piniz AS backup_piniz, periodi.pfin AS backup_pfin FROM stuff, periodi WHERE stuff.ntx = periodi.ntx AND stuff.numperiod = periodi.numperiod'
- results2Simp2_c1 = pd.read_sql(riQuery, connection)
- results2Simp2_c2 = pd.read_sql(bisQuery, connection)
- results2Simp2_c3 = pd.read_sql(trisQuery, connection)
- results2Simp2_c4 = pd.read_sql(quadrisQuery, connection)
- results2Simp2_c3['piniz'] = results2Simp2_c1['piniz']
- results2Simp2_c3['pfin'] = results2Simp2_c2['pfin']
- results2Simp2_c3[['backup_piniz', 'backup_pfin']] = results2Simp2_c4[['backup_piniz', 'backup_pfin']]
- timestamp1 = time.time()
- db_results_decode_pandas(results2Simp2_c3, vettSpec)
- print(timestamp1 - timestamp0)
- print(time.time() - timestamp0)
|