queries.py 9.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187
  1. # %%
  2. import json
  3. import sqlite3
  4. import pandas as pd
  5. import time
  6. from decoding.decoding import getVettSpec, db_results_decode_pandas, db_results_decode, db_results_decode_nodict
  7. # %%
  8. #
  9. # Ricerca: copiata dalla ricerca per cooccorrenze, versione 18 Maggio 2023 Develop, con prima prima 'c*' come Lemma (con forme non lemmatizzate, il default) -- un sacco di risultati! -- seguita da un findtext automatico per TUTTI i contesti (il default per le cooccorrenze). Le parole successive non sono rilevanti per questa prova. Solo il procedimento fino alla prima query 'lunga' è considerato.
  10. # Il tutto è eseguito sul DB 'di prova grande' ndg2.gat4, poi MODIFICATO per aggiungere un indice su Periodi.
  11. # Il file del DB (copiato in locale)
  12. dbFile = 'corpus.db'
  13. ## First query (notare che il DB è codificato)
  14. firstQuery = "SELECT spec AS lemma, cat AS cat_gr, omo AS disambiguatore, nocc AS occ, cod FROM lem WHERE spec LIKE 'Г%' ORDER BY idlem"
  15. def secondQuery(strlist):
  16. return f"SELECT DISTINCT lemma as codLemma, forma as codForma FROM pfl WHERE lemma IN ({strlist})"
  17. # The BAD query
  18. def theQuery(LIST1, LIST2):
  19. return f'SELECT tab.cod, tab.ntx, tab.pitxt, tab.elemlen, tab.mappa, tab.numperiod, tab.links, tab.numorg, intbib.sigla, tab.vol, tab.pag, tab.riga, tab.col, tab.tipostanza, tab.stanza, tab.verso, tab.numbrano, lem.spec AS lemma, lem.cat AS cat_gr, lem.omo AS disambiguatore, prev_tab.pitxt AS piniz, next_tab.pitxt AS pfin, periodi.piniz AS backup_piniz, periodi.pfin AS backup_pfin FROM Occ00001 AS tab INNER JOIN intbib ON tab.ntx = intbib.ntx INNER JOIN lem ON tab.indlem = lem.cod LEFT JOIN Occ00001 AS prev_tab ON (tab.ntx = prev_tab.ntx AND tab.mappa = prev_tab.mappa+15) LEFT JOIN Occ00001 AS next_tab ON (tab.ntx = next_tab.ntx AND tab.mappa = next_tab.mappa-15) LEFT JOIN periodi ON (tab.ntx = periodi.ntx AND tab.numperiod = periodi.numperiod) WHERE tab.indlem IN ({LIST1}) OR (tab.indlem = 0 AND tab.cod IN ({LIST2}))'
  20. # A less BAD query
  21. def theQuerySimp(LIST1):
  22. return f'SELECT tab.cod, tab.ntx, tab.pitxt, tab.elemlen, tab.mappa, tab.numperiod, tab.links, tab.numorg, intbib.sigla, tab.vol, tab.pag, tab.riga, tab.col, tab.tipostanza, tab.stanza, tab.verso, tab.numbrano, lem.spec AS lemma, lem.cat AS cat_gr, lem.omo AS disambiguatore, prev_tab.pitxt AS piniz, next_tab.pitxt AS pfin, periodi.piniz AS backup_piniz, periodi.pfin AS backup_pfin FROM Occ00001 AS tab INNER JOIN intbib ON tab.ntx = intbib.ntx INNER JOIN lem ON tab.indlem = lem.cod LEFT JOIN Occ00001 AS prev_tab ON (tab.ntx = prev_tab.ntx AND tab.mappa = prev_tab.mappa+15) LEFT JOIN Occ00001 AS next_tab ON (tab.ntx = next_tab.ntx AND tab.mappa = next_tab.mappa-15) LEFT JOIN periodi ON (tab.ntx = periodi.ntx AND tab.numperiod = periodi.numperiod) WHERE tab.indlem IN ({LIST1})'
  23. # LIST1, LIST2 sono forniti in degli array in due file json in questo folder
  24. # Dict factory for non-Pandas queries
  25. def dict_factory(cursor, row):
  26. fields = [column[0] for column in cursor.description]
  27. return {key: value for key, value in zip(fields, row)}
  28. # VettSpec for decoding
  29. vettSpec = getVettSpec('decoding/')
  30. # %%
  31. timestamp0 = time.time()
  32. with sqlite3.connect(f"file:{dbFile}?mode=ro", uri=True) as connection:
  33. results = pd.read_sql(firstQuery, connection)
  34. timestamp1 = time.time()
  35. db_results_decode_pandas(results, vettSpec)
  36. print(time.time() - timestamp0)
  37. print(timestamp1 - timestamp0)
  38. # %%
  39. timestamp0 = time.time()
  40. with sqlite3.connect(f"file:{dbFile}?mode=ro", uri=True) as connection:
  41. connection.row_factory = dict_factory
  42. queryReponse = connection.cursor().execute(firstQuery)
  43. resultsNoPandas = queryReponse.fetchall()
  44. timestamp1 = time.time()
  45. db_results_decode(resultsNoPandas, vettSpec)
  46. print(time.time() - timestamp0)
  47. print(timestamp1 - timestamp0)
  48. # %%
  49. timestamp0 = time.time()
  50. with sqlite3.connect(f"file:{dbFile}?mode=ro", uri=True) as connection:
  51. queryReponse = connection.cursor().execute(firstQuery)
  52. resultsRaw = queryReponse.fetchall()
  53. timestamp1 = time.time()
  54. resultsRawB = db_results_decode_nodict(resultsRaw, vettSpec)
  55. print(time.time() - timestamp0)
  56. print(timestamp1 - timestamp0)
  57. # %%
  58. codes = [res['cod'] for res in resultsNoPandas]
  59. codesStr = [str(code) for code in codes]
  60. # %%
  61. timestamp0 = time.time()
  62. with sqlite3.connect(f"file:{dbFile}?mode=ro", uri=True) as connection:
  63. results2 = pd.read_sql(secondQuery(','.join(codesStr)), connection)
  64. print(time.time() - timestamp0)
  65. # %%
  66. formCodes = list(results2['codForma'])
  67. formCodesStr = [str(code) for code in formCodes]
  68. # %%
  69. # Cross-check
  70. with open('query_list_1.json', 'r') as file1:
  71. codesFromFile = json.load(file1)
  72. with open('query_list_2.json', 'r') as file1:
  73. formCodesFromFile = json.load(file1)
  74. print(codes==codesFromFile)
  75. print(formCodes==formCodesFromFile)
  76. # %%
  77. # LUNGA query
  78. timestamp0 = time.time()
  79. with sqlite3.connect(f"file:{dbFile}?mode=ro", uri=True) as connection:
  80. tmpQuery = theQuery(','.join(codesStr), ','.join(formCodesStr))
  81. results2 = pd.read_sql(tmpQuery, connection)
  82. db_results_decode_pandas(results2, vettSpec)
  83. print(time.time() - timestamp0)
  84. # %%
  85. # Dump dei risultati
  86. #with open('DeResult.json', 'w') as file1:
  87. # results2Dict = json.loads(results2.to_json(orient='records'))
  88. # json.dump(results2Dict, file1, indent=2)
  89. # %%
  90. # Sempre lunghina
  91. timestamp0 = time.time()
  92. with sqlite3.connect(f"file:{dbFile}?mode=ro", uri=True) as connection:
  93. tmpQuery = theQuerySimp(','.join(codesStr))
  94. results2Simp = pd.read_sql(tmpQuery, connection)
  95. print(time.time() - timestamp0)
  96. # %%
  97. # VEDERE DI SEMPLIFICARE...
  98. # %%
  99. # A still less BAD query
  100. def theQuerySimp2(LIST1, LIST2):
  101. return f'SELECT tab.cod, tab.ntx, tab.pitxt, tab.elemlen, tab.mappa, tab.numperiod, tab.links, tab.numorg, intbib.sigla, tab.vol, tab.pag, tab.riga, tab.col, tab.tipostanza, tab.stanza, tab.verso, tab.numbrano, lem.spec AS lemma, lem.cat AS cat_gr, lem.omo AS disambiguatore FROM Occ00001 AS tab INNER JOIN intbib ON tab.ntx = intbib.ntx INNER JOIN lem ON tab.indlem = lem.cod WHERE tab.indlem IN ({LIST1}) OR (tab.indlem = 0 AND tab.cod IN ({LIST2}))'
  102. '''
  103. NOTA:
  104. prev_tab.pitxt AS piniz, next_tab.pitxt AS pfin, periodi.piniz AS backup_piniz, periodi.pfin AS backup_pfin
  105. LEFT JOIN Occ00001 AS prev_tab ON (tab.ntx = prev_tab.ntx AND tab.mappa = prev_tab.mappa+15)
  106. LEFT JOIN Occ00001 AS next_tab ON (tab.ntx = next_tab.ntx AND tab.mappa = next_tab.mappa-15)
  107. LEFT JOIN periodi ON (tab.ntx = periodi.ntx AND tab.numperiod = periodi.numperiod)
  108. '''
  109. # %%
  110. # %%
  111. timestamp0 = time.time()
  112. with sqlite3.connect(f"file:{dbFile}?mode=ro", uri=True) as connection:
  113. tmpQuery = theQuerySimp2(','.join(codesStr), ','.join(formCodesStr))
  114. results2Simp2_a = pd.read_sql(tmpQuery, connection)
  115. print(time.time() - timestamp0)
  116. # %%
  117. timestamp0 = time.time()
  118. with sqlite3.connect(f"file:{dbFile}?mode=ro", uri=True) as connection:
  119. tmpQuery = theQuerySimp2(','.join(codesStr), ','.join(formCodesStr))
  120. results2Simp2_b = pd.read_sql(tmpQuery, connection)
  121. piniz = []
  122. pfin = []
  123. backup_piniz = []
  124. backup_pfin = []
  125. cur = connection.cursor()
  126. for index, row in results2Simp2_b.iterrows():
  127. ntx = row['ntx']
  128. prevMappa = row['mappa'] - 15
  129. cur.execute(f'SELECT prev_tab.pitxt AS piniz FROM Occ00001 AS prev_tab WHERE prev_tab.ntx = {ntx} AND prev_tab.mappa = {prevMappa}')
  130. res = cur.fetchone()
  131. piniz.append(res[0] if res is not None else None)
  132. print(time.time() - timestamp0)
  133. # %%
  134. ############################
  135. # Test with temporary table!
  136. ############################
  137. timestamp0 = time.time()
  138. with sqlite3.connect(f"file:{dbFile}?mode=ro", uri=True) as connection:
  139. tmpQuery = theQuerySimp2(','.join(codesStr), ','.join(formCodesStr))
  140. querr = 'CREATE TEMPORARY TABLE stuff AS ' + tmpQuery
  141. connection.cursor().execute(querr)
  142. riQuery = f'SELECT stuff.ntx, stuff.mappa, tab.pitxt AS piniz FROM stuff LEFT JOIN Occ00001 AS tab ON tab.ntx=stuff.ntx AND tab.mappa=stuff.mappa-15'
  143. bisQuery = f'SELECT stuff.ntx, stuff.mappa, tab.pitxt AS pfin FROM stuff LEFT JOIN Occ00001 AS tab ON tab.ntx=stuff.ntx AND tab.mappa=stuff.mappa+15'
  144. trisQuery = f'SELECT * from stuff'
  145. quadrisQuery = f'SELECT stuff.ntx, stuff.numperiod, periodi.piniz AS backup_piniz, periodi.pfin AS backup_pfin FROM stuff, periodi WHERE stuff.ntx = periodi.ntx AND stuff.numperiod = periodi.numperiod'
  146. results2Simp2_c1 = pd.read_sql(riQuery, connection)
  147. results2Simp2_c2 = pd.read_sql(bisQuery, connection)
  148. results2Simp2_c3 = pd.read_sql(trisQuery, connection)
  149. results2Simp2_c4 = pd.read_sql(quadrisQuery, connection)
  150. results2Simp2_c3['piniz'] = results2Simp2_c1['piniz']
  151. results2Simp2_c3['pfin'] = results2Simp2_c2['pfin']
  152. results2Simp2_c3[['backup_piniz', 'backup_pfin']] = results2Simp2_c4[['backup_piniz', 'backup_pfin']]
  153. timestamp1 = time.time()
  154. db_results_decode_pandas(results2Simp2_c3, vettSpec)
  155. print(timestamp1 - timestamp0)
  156. print(time.time() - timestamp0)
  157. # %%
  158. #with open('DeResult_chk.json', 'w') as file1:
  159. # results2Simp2_c3_Dict = json.loads(results2Simp2_c3.to_json(orient='records'))
  160. # json.dump(results2Simp2_c3_Dict, file1, indent=2)
  161. # %%