actual_queries.py 17 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337
  1. import pandas as pd
  2. import itertools
  3. def prepareQuery(queryData):
  4. type = queryData.get('queryType') # KeyError protected -- returns None if the key is not defined
  5. #################
  6. if type=='occ_tables':
  7. return "SELECT name FROM sqlite_master WHERE type='table'"
  8. #################
  9. if type=='forma':
  10. try:
  11. data = queryData['data']
  12. dataNorm = queryData['dataNorm']
  13. except KeyError as err:
  14. raise KeyError('Missing required data for query type ' + type + ': ' + str(err))
  15. joinedQueryData = "'" + "' OR spec LIKE '".join(data) + "'"
  16. if len(dataNorm)==0:
  17. return f"SELECT spec AS forma, nocc AS occ, cod FROM form WHERE spec LIKE {joinedQueryData} ORDER BY idfor"
  18. else:
  19. joinedQueryDataNorm = "'" + "' OR norm LIKE '".join(dataNorm) + "'"
  20. return f"SELECT DISTINCT spec AS forma, nocc AS occ, cod FROM form WHERE (spec LIKE {joinedQueryData}) OR (norm LIKE {joinedQueryDataNorm}) ORDER BY idfor"
  21. ###################
  22. elif type=='lemma':
  23. try:
  24. data = queryData['data']
  25. dataNorm = queryData['dataNorm']
  26. except KeyError as err:
  27. raise KeyError('Missing required data for query type ' + type + ': ' + str(err))
  28. joinedQueryData = "'" + "' OR spec LIKE '".join(data) + "'"
  29. if len(dataNorm)==0:
  30. return f"SELECT spec AS lemma, cat AS cat_gr, omo AS disambiguatore, nocc AS occ, cod FROM lem WHERE spec LIKE {joinedQueryData} ORDER BY idlem"
  31. else:
  32. joinedQueryDataNorm = "'" + "' OR norm LIKE '".join(dataNorm) + "'"
  33. return f"SELECT DISTINCT spec AS lemma, cat AS cat_gr, omo AS disambiguatore, nocc AS occ, cod FROM lem WHERE (spec LIKE {joinedQueryData}) OR (norm LIKE {joinedQueryDataNorm}) ORDER BY idlem"
  34. ########################
  35. elif type=='lemmaForma':
  36. try:
  37. data = queryData['data']
  38. dataNorm = queryData['dataNorm']
  39. except KeyError as err:
  40. raise KeyError('Missing required data for query type ' + type + ': ' + str(err))
  41. joinedQueryData = "'" + "' OR form.spec LIKE '".join(data) + "'"
  42. if len(dataNorm)==0:
  43. return f"SELECT lem.spec AS lemma, lem.cat AS cat_gr, form.spec AS forma, lem.omo AS disambiguatore, pfl.nocc AS occ, lem.cod FROM pfl INNER JOIN form ON form.cod = pfl.forma INNER JOIN lem ON lem.cod != 0 AND lem.cod = pfl.lemma WHERE lem.spec LIKE {joinedQueryData} ORDER BY lem.idlem"
  44. else:
  45. joinedQueryData = "'" + "' OR lem.spec LIKE '".join(data) + "'"
  46. joinedQueryDataNorm = "'" + "' OR lem.norm LIKE '".join(dataNorm) + "'"
  47. return f"SELECT DISTINCT lem.spec AS lemma, lem.cat AS cat_gr, form.spec AS forma, lem.omo AS disambiguatore, pfl.nocc AS occ, lem.cod FROM pfl INNER JOIN form ON form.cod = pfl.forma INNER JOIN lem ON lem.cod = pfl.lemma WHERE (lem.spec LIKE {joinedQueryData}) OR (lem.norm LIKE {joinedQueryDataNorm}) ORDER BY lem.idlem"
  48. ########################
  49. elif type=='formaLemma':
  50. try:
  51. data = queryData['data']
  52. dataNorm = queryData['dataNorm']
  53. except KeyError as err:
  54. raise KeyError('Missing required data for query type ' + type + ': ' + str(err))
  55. joinedQueryData = "'" + "' OR form.spec LIKE '".join(data) + "'"
  56. if len(dataNorm)==0:
  57. return f"SELECT form.spec AS forma, lem.spec AS lemma, lem.cat AS cat_gr, lem.omo AS disambiguatore, pfl.nocc AS occ, form.cod FROM pfl INNER JOIN form ON form.cod = pfl.forma INNER JOIN lem ON lem.cod = pfl.lemma WHERE form.spec LIKE {joinedQueryData} ORDER BY form.idfor"
  58. else:
  59. joinedQueryDataNorm = "'" + "' OR form.norm LIKE '".join(dataNorm) + "'"
  60. return f"SELECT DISTINCT form.spec AS forma, lem.spec AS lemma, lem.cat AS cat_gr, lem.omo AS disambiguatore, pfl.nocc AS occ, form.cod FROM pfl INNER JOIN form ON form.cod = pfl.forma INNER JOIN lem ON lem.cod = pfl.lemma WHERE (form.spec LIKE {joinedQueryData}) OR (form.norm LIKE {joinedQueryDataNorm}) ORDER BY form.idfor"
  61. #################
  62. elif type=='pfl':
  63. try:
  64. codList = queryData['codList']
  65. except KeyError as err:
  66. raise KeyError('Missing required data for query type ' + type + ': ' + str(err))
  67. strlist = ",".join(str(c) for c in codList)
  68. return f"SELECT DISTINCT lemma as codLemma, forma as codForma FROM pfl WHERE lemma IN ({strlist})"
  69. ###################
  70. elif type=='texts':
  71. return complexQueryTexts
  72. ###################
  73. elif type=='co-occurrences':
  74. return complexQueryCooccurrences
  75. ######################
  76. elif type=='bib':
  77. try:
  78. row = queryData['row']
  79. sigla = row['sigla']
  80. except KeyError as err:
  81. raise KeyError('Missing required data for query type ' + type + ': ' + str(err))
  82. return f"SELECT [Anno iniziale], [Anno finale], [Data codificata], [Titolo Abbreviato], [Autore], [Titolo], [Curatore], [Data descrittiva], [Area generica], [Area specifica], [Genere], [Forma], [Tipo], IQ FROM datibib WHERE Sigla='{sigla}'"
  83. #################
  84. elif type=='bibAlt':
  85. try:
  86. siglaSet = queryData['siglaSet']
  87. except KeyError as err:
  88. raise KeyError('Missing required data for query type ' + type + ': ' + str(err))
  89. siglaStr = "'" + "','".join(siglaSet) + "'"
  90. return f"SELECT Sigla, [Anno iniziale], [Anno finale], [Data codificata], [Titolo Abbreviato], [Autore], [Titolo], [Curatore], [Data descrittiva], [Area generica], [Area specifica], [Genere], [Forma], [Tipo], IQ FROM datibib WHERE Sigla IN ({siglaStr})"
  91. #################
  92. elif type=='rif':
  93. try:
  94. row = queryData['row']
  95. numorg = row['numorg']
  96. ntx = row['ntx']
  97. except:
  98. return None
  99. return f"SELECT head AS Rif_organico, full AS Rif_completo FROM org WHERE (indice='{numorg}' AND ntx='{ntx}')"
  100. #################
  101. elif type=='rifAlt':
  102. return rifAlt
  103. #################
  104. elif type=='highlight':
  105. try:
  106. row = queryData['row']
  107. col = queryData['col']
  108. except:
  109. return None
  110. return f"SELECT spec as highlight FROM form WHERE cod={row[col]}"
  111. #################
  112. elif type =='singlecontext':
  113. try:
  114. subtype = queryData['querySubtype']
  115. table = queryData['table']
  116. parole = queryData['parole']
  117. periodi = queryData['periodi']
  118. brani = queryData['brani']
  119. ntxlocal = queryData['ntxlocal']
  120. mappalocal = queryData['mappalocal']
  121. periodlocal = queryData['periodlocal']
  122. numbranolocal = queryData['numbranolocal']
  123. except:
  124. return None
  125. if subtype == 'parole':
  126. return f"SELECT tab.pitxt, tab.elemlen FROM {table} AS tab WHERE tab.ntx = {ntxlocal} AND tab.mappa <= {mappalocal+int(parole/2)} AND tab.mappa >= {mappalocal-int(parole/2)}"
  127. elif subtype == 'periodi':
  128. return f"SELECT piniz, pfin FROM periodi WHERE ntx = {ntxlocal} AND numperiod <= {periodlocal+int(periodi/2)} AND numperiod >= {periodlocal-int(periodi/2)}"
  129. elif subtype == 'brani':
  130. return f"SELECT piniz, pfin FROM linkbase WHERE {ntxlocal} = ntx AND tipo = 2 AND id BETWEEN {numbranolocal-int(brani/2)} AND {numbranolocal+int(brani/2)}"
  131. #################
  132. elif type =='links':
  133. try:
  134. subtype = queryData['querySubtype']
  135. ntxlocal = queryData['ntxlocal']
  136. pinizlocal = queryData['pinizlocal']
  137. pfinlocal = queryData['pfinlocal']
  138. pitxtlocal = queryData['pitxtlocal']
  139. except:
  140. return None
  141. if subtype == 'nota':
  142. return f"SELECT ta.ntx, ta.id, ta.piniz, ta.pfin, tb.mappain, tb.mappafin FROM linkbase AS tb INNER JOIN linknoteass AS ta ON tb.ntx = ta.ntx AND tb.id = ta.id WHERE (((tb.tipo= 1) AND (tb.ntx = {ntxlocal})) AND ((tb.piniz BETWEEN {pinizlocal} AND {pfinlocal}) OR ({pitxtlocal} BETWEEN tb.piniz AND tb.pfin)))"
  143. if subtype == 'testo_associato':
  144. return f"SELECT ta.ntx, ta.id, ta.piniz, ta.pfin, tb.mappain, tb.mappafin FROM linkbase AS tb INNER JOIN linknoteass AS ta ON tb.ntx = ta.ntx AND tb.id = ta.id WHERE (((tb.tipo= 2) AND (tb.ntx = {ntxlocal})) AND ((tb.piniz BETWEEN {pinizlocal} AND {pfinlocal}) OR ({pitxtlocal} BETWEEN tb.piniz AND tb.pfin)))"
  145. #####
  146. else:
  147. raise ValueError('Unrecognized query type: ' + type)
  148. def rifAlt(connection, queryData):
  149. try:
  150. coordsSet = queryData['coordsSet']
  151. except KeyError as err:
  152. raise KeyError('Missing required data for query type ' + type + ': ' + str(err))
  153. subQueries = []
  154. for coords in coordsSet:
  155. try:
  156. numorg = coords[0]
  157. ntx = coords[1]
  158. except IndexError as err:
  159. raise KeyError('Incomplete required data for query type ' + type + ': ' + str(err))
  160. subQueries.append( f"SELECT indice AS numorg, ntx, head AS Rif_organico, full AS Rif_completo FROM org WHERE (indice='{numorg}' AND ntx='{ntx}')" )
  161. pandas = queryData.get('pandas')
  162. resultList = []
  163. for query in subQueries:
  164. if pandas:
  165. resultList.append( pd.read_sql(query, connection) )
  166. else:
  167. connection.row_factory = dict_factory
  168. queryReponse = connection.cursor().execute(query)
  169. resultList.append( queryReponse.fetchall() )
  170. if pandas:
  171. results = pd.concat(resultList)
  172. else:
  173. results = list(itertools.chain.from_iterable(resultList))
  174. return results
  175. def complexQueryTexts(connection, queryData):
  176. try:
  177. codList = queryData['codList']
  178. table = queryData['table']
  179. subtype = queryData['querySubtype']
  180. formCodList = queryData.get('formCodList') # KeyError-safe (None if absent)
  181. except KeyError as err:
  182. raise KeyError('Missing required data for query type ' + type + ': ' + str(err))
  183. strCodList = ",".join(str(c) for c in codList)
  184. # Main query, verified to be fast!
  185. mainQueryString = f"SELECT tab.cod, tab.ntx, tab.pitxt, tab.elemlen, tab.mappa, tab.numperiod, tab.links, tab.numorg, intbib.sigla, tab.vol, tab.pag, tab.riga, tab.col, tab.tipostanza, tab.stanza, tab.verso, tab.numbrano, lem.spec AS lemma, lem.cat AS cat_gr, lem.omo AS disambiguatore FROM {table} AS tab INNER JOIN intbib ON tab.ntx = intbib.ntx INNER JOIN lem ON tab.indlem = lem.cod"
  186. if subtype==0:
  187. condition = f"WHERE tab.cod IN ({strCodList})"
  188. elif subtype==1:
  189. condition = f"WHERE tab.indlem IN ({strCodList})"
  190. elif subtype==2:
  191. if formCodList is None:
  192. return None
  193. strFormCodList = ",".join(str(c) for c in formCodList)
  194. condition = f" WHERE tab.indlem IN ({strCodList}) OR (tab.indlem = 0 AND tab.cod IN ({strFormCodList}))"
  195. mainQueryString = f'{mainQueryString} {condition}'
  196. # This value can be changed to change multiple contexts width. Default value for Gatto is parole=31 #
  197. parole = 31
  198. # C'è la possibilità di scegliere periodi invece che parole, ma per il momento è disabilitata
  199. createTempTable = f'CREATE TEMPORARY TABLE stuff AS {mainQueryString}'
  200. mainQuery = f'SELECT * from stuff'
  201. addQuery1 = f'SELECT stuff.ntx, stuff.mappa, tab.pitxt AS piniz FROM stuff LEFT JOIN Occ00001 AS tab ON tab.ntx=stuff.ntx AND tab.mappa=stuff.mappa-{int(parole/2)}'
  202. addQuery2 = f'SELECT stuff.ntx, stuff.mappa, tab.pitxt AS pfin FROM stuff LEFT JOIN Occ00001 AS tab ON tab.ntx=stuff.ntx AND tab.mappa=stuff.mappa+{int(parole/2)}'
  203. addQuery3 = f'SELECT stuff.ntx, stuff.numperiod, periodi.piniz AS backup_piniz, periodi.pfin AS backup_pfin FROM stuff, periodi WHERE stuff.ntx = periodi.ntx AND stuff.numperiod = periodi.numperiod'
  204. # Start communication with DB
  205. connection.cursor().execute(createTempTable)
  206. results = pd.read_sql(mainQuery, connection)
  207. results_add1 = pd.read_sql(addQuery1, connection)
  208. results_add2 = pd.read_sql(addQuery2, connection)
  209. results_add3 = pd.read_sql(addQuery3, connection)
  210. results['piniz'] = results_add1['piniz']
  211. results['pfin'] = results_add2['pfin']
  212. results[['backup_piniz', 'backup_pfin']] = results_add3[['backup_piniz', 'backup_pfin']]
  213. return results
  214. def complexQueryCooccurrences(connection, queryData):
  215. try:
  216. # the get method for dicts is KeyError-safe (returns None if key is absent)
  217. occurrences = queryData['occurrences']
  218. table = queryData['table']
  219. intervallo = queryData['intervallo']
  220. periodo = queryData.get('periodo') # Unused for the moment
  221. ordinate = queryData.get('ordinate') # Unused for the moment
  222. if periodo is None:
  223. periodo = 0
  224. if ordinate is None:
  225. ordinate = 0
  226. except KeyError as err:
  227. raise KeyError('Missing required data for query: ' + str(err))
  228. # Main part of main query -- verified to be fast!
  229. preMainQueryString = f"SELECT tab.cod, tab.ntx, tab.pitxt, tab.elemlen, tab.mappa, tab.numperiod, tab.links, tab.numorg, intbib.sigla, tab.vol, tab.pag, tab.riga, tab.col, tab.tipostanza, tab.stanza, tab.verso, tab.numbrano, lem.spec AS lemma, lem.cat AS cat_gr, lem.omo AS disambiguatore FROM {table} AS tab INNER JOIN intbib ON tab.ntx = intbib.ntx INNER JOIN lem ON tab.indlem = lem.cod"
  230. # Main loop on the different occurrences searched by user
  231. pitxtList = ['pitxt']
  232. elemlenList = ['elemlen']
  233. for index, occ in enumerate(occurrences):
  234. try:
  235. subtype = occ['querySubtype']
  236. codList = occ['codList']
  237. formCodList = occ.get('formCodList')
  238. except KeyError as err:
  239. raise KeyError('Missing required data for query: ' + str(err))
  240. strCodList = ",".join(str(c) for c in codList)
  241. if subtype==0:
  242. condition = f" WHERE tab.cod IN ({strCodList})"
  243. elif subtype==1:
  244. condition = f" WHERE tab.indlem IN ({strCodList})"
  245. elif subtype==2:
  246. if formCodList is None:
  247. return None
  248. strFormCodList = ",".join(str(c) for c in formCodList)
  249. condition = f" WHERE tab.indlem IN ({strCodList}) OR (tab.indlem = 0 AND tab.cod IN ({strFormCodList}))"
  250. mainQueryString = f'{preMainQueryString} {condition}'
  251. # First occurrence:
  252. if index==0:
  253. # Create a temporary table for results
  254. resTable = 'tempOcc_' + str(index)
  255. connection.cursor().execute(f'CREATE TEMPORARY TABLE {resTable} AS {mainQueryString}')
  256. connection.cursor().execute(f'CREATE INDEX aa_{index} ON {resTable} (ntx, mappa)')
  257. continue
  258. else:
  259. # update results
  260. connection.cursor().execute(f'CREATE TEMPORARY TABLE tempOccB AS {mainQueryString}')
  261. connection.cursor().execute(f'CREATE INDEX bb ON tempOccB (ntx, mappa)')
  262. oldTable = resTable
  263. resTable = 'tempOcc_' + str(index)
  264. connection.cursor().execute(f'CREATE TEMPORARY TABLE {resTable} AS SELECT tabA.cod, tabA.ntx, tabA.{" tabA.".join(pitxtList)}, tabA.{" tabA.".join(elemlenList)}, tabA.mappa, tabA.numperiod, tabA.links, tabA.numorg, tabA.sigla, tabA.vol, tabA.pag, tabA.riga, tabA.col, tabA.tipostanza, tabA.stanza, tabA.verso, tabA.numbrano, tabA.lemma, tabA.cat_gr, tabA.disambiguatore, tabB.ntx AS ntx2, tabB.mappa AS mappa2, tabB.pitxt as pitxt_{index}, tabB.elemlen as elemlen_{index} FROM {oldTable} AS tabA, tempOccB AS tabB WHERE tabA.ntx=tabB.ntx AND tabA.mappa BETWEEN tabB.mappa-{intervallo} AND tabB.mappa+{intervallo} AND tabA.mappa != tabB.mappa')
  265. connection.cursor().execute(f'CREATE INDEX aa_{index} ON {resTable} (ntx, mappa)')
  266. connection.cursor().execute(f'DROP TABLE {oldTable}')
  267. pitxtList.append(f'pitxt_{index}')
  268. elemlenList.append(f'elemlen_{index}')
  269. results = pd.read_sql(f'SELECT * FROM {resTable}', connection)
  270. # This value can be changed to change multiple contexts width. Default value for Gatto is parole=31
  271. parole = 31
  272. # C'è la possibilità di scegliere periodi invece che parole, ma per il momento è disabilitata
  273. queryPiniz = f'SELECT stuff.ntx, stuff.mappa, tab.pitxt AS piniz FROM {resTable} AS stuff LEFT JOIN {table} AS tab ON tab.ntx=stuff.ntx AND tab.mappa=stuff.mappa-{int(parole/2)}'
  274. queryPfin = f'SELECT stuff.ntx, stuff.mappa, tab.pitxt AS pfin FROM {resTable} AS stuff LEFT JOIN {table} AS tab ON tab.ntx=stuff.ntx AND tab.mappa=stuff.mappa+{int(parole/2)}'
  275. queryPeriodi = f'SELECT stuff.ntx, stuff.numperiod, periodi.piniz AS backup_piniz, periodi.pfin AS backup_pfin FROM {resTable} AS stuff, periodi WHERE stuff.ntx = periodi.ntx AND stuff.numperiod = periodi.numperiod'
  276. resultsPiniz = pd.read_sql(queryPiniz, connection)
  277. resultsPfin = pd.read_sql(queryPfin, connection)
  278. resultsPeriodi = pd.read_sql(queryPeriodi, connection)
  279. results['piniz'] = resultsPiniz['piniz']
  280. results['pfin'] = resultsPfin['pfin']
  281. results[['backup_piniz', 'backup_pfin']] = resultsPeriodi[['backup_piniz', 'backup_pfin']]
  282. return results
  283. # Dict factory non-Pandas queries
  284. def dict_factory(cursor, row):
  285. fields = [column[0] for column in cursor.description]
  286. return {key: value for key, value in zip(fields, row)}