import pandas as pd import itertools def prepareQuery(queryData): type = queryData.get('queryType') # KeyError protected -- returns None if the key is not defined ################# if type=='occ_tables': return "SELECT name FROM sqlite_master WHERE type='table'" ################# if type=='forma': try: data = queryData['data'] dataNorm = queryData['dataNorm'] except KeyError as err: raise KeyError('Missing required data for query type ' + type + ': ' + str(err)) joinedQueryData = "'" + "' OR spec LIKE '".join(data) + "'" if len(dataNorm)==0: return f"SELECT spec AS forma, nocc AS occ, cod FROM form WHERE spec LIKE {joinedQueryData} ORDER BY idfor" else: joinedQueryDataNorm = "'" + "' OR norm LIKE '".join(dataNorm) + "'" return f"SELECT DISTINCT spec AS forma, nocc AS occ, cod FROM form WHERE (spec LIKE {joinedQueryData}) OR (norm LIKE {joinedQueryDataNorm}) ORDER BY idfor" ################### elif type=='lemma': try: data = queryData['data'] dataNorm = queryData['dataNorm'] except KeyError as err: raise KeyError('Missing required data for query type ' + type + ': ' + str(err)) joinedQueryData = "'" + "' OR spec LIKE '".join(data) + "'" if len(dataNorm)==0: return f"SELECT spec AS lemma, cat AS cat_gr, omo AS disambiguatore, nocc AS occ, cod FROM lem WHERE spec LIKE {joinedQueryData} ORDER BY idlem" else: joinedQueryDataNorm = "'" + "' OR norm LIKE '".join(dataNorm) + "'" return f"SELECT DISTINCT spec AS lemma, cat AS cat_gr, omo AS disambiguatore, nocc AS occ, cod FROM lem WHERE (spec LIKE {joinedQueryData}) OR (norm LIKE {joinedQueryDataNorm}) ORDER BY idlem" ######################## elif type=='lemmaForma': try: data = queryData['data'] dataNorm = queryData['dataNorm'] except KeyError as err: raise KeyError('Missing required data for query type ' + type + ': ' + str(err)) joinedQueryData = "'" + "' OR form.spec LIKE '".join(data) + "'" if len(dataNorm)==0: return f"SELECT lem.spec AS lemma, lem.cat AS cat_gr, form.spec AS forma, lem.omo AS disambiguatore, pfl.nocc AS occ, lem.cod FROM pfl INNER JOIN form ON form.cod = pfl.forma INNER JOIN lem ON lem.cod != 0 AND lem.cod = pfl.lemma WHERE lem.spec LIKE {joinedQueryData} ORDER BY lem.idlem" else: joinedQueryData = "'" + "' OR lem.spec LIKE '".join(data) + "'" joinedQueryDataNorm = "'" + "' OR lem.norm LIKE '".join(dataNorm) + "'" return f"SELECT DISTINCT lem.spec AS lemma, lem.cat AS cat_gr, form.spec AS forma, lem.omo AS disambiguatore, pfl.nocc AS occ, lem.cod FROM pfl INNER JOIN form ON form.cod = pfl.forma INNER JOIN lem ON lem.cod = pfl.lemma WHERE (lem.spec LIKE {joinedQueryData}) OR (lem.norm LIKE {joinedQueryDataNorm}) ORDER BY lem.idlem" ######################## elif type=='formaLemma': try: data = queryData['data'] dataNorm = queryData['dataNorm'] except KeyError as err: raise KeyError('Missing required data for query type ' + type + ': ' + str(err)) joinedQueryData = "'" + "' OR form.spec LIKE '".join(data) + "'" if len(dataNorm)==0: return f"SELECT form.spec AS forma, lem.spec AS lemma, lem.cat AS cat_gr, lem.omo AS disambiguatore, pfl.nocc AS occ, form.cod FROM pfl INNER JOIN form ON form.cod = pfl.forma INNER JOIN lem ON lem.cod = pfl.lemma WHERE form.spec LIKE {joinedQueryData} ORDER BY form.idfor" else: joinedQueryDataNorm = "'" + "' OR form.norm LIKE '".join(dataNorm) + "'" return f"SELECT DISTINCT form.spec AS forma, lem.spec AS lemma, lem.cat AS cat_gr, lem.omo AS disambiguatore, pfl.nocc AS occ, form.cod FROM pfl INNER JOIN form ON form.cod = pfl.forma INNER JOIN lem ON lem.cod = pfl.lemma WHERE (form.spec LIKE {joinedQueryData}) OR (form.norm LIKE {joinedQueryDataNorm}) ORDER BY form.idfor" ################# elif type=='pfl': try: codList = queryData['codList'] except KeyError as err: raise KeyError('Missing required data for query type ' + type + ': ' + str(err)) strlist = ",".join(str(c) for c in codList) return f"SELECT DISTINCT lemma as codLemma, forma as codForma FROM pfl WHERE lemma IN ({strlist})" ################### elif type=='texts': return complexQueryTexts ################### elif type=='co-occurrences': return complexQueryCooccurrences ###################### elif type=='bib': try: row = queryData['row'] sigla = row['sigla'] except KeyError as err: raise KeyError('Missing required data for query type ' + type + ': ' + str(err)) return f"SELECT [Anno iniziale], [Anno finale], [Data codificata], [Titolo Abbreviato], [Autore], [Titolo], [Curatore], [Data descrittiva], [Area generica], [Area specifica], [Genere], [Forma], [Tipo], IQ FROM datibib WHERE Sigla='{sigla}'" ################# elif type=='bibAlt': try: siglaSet = queryData['siglaSet'] except KeyError as err: raise KeyError('Missing required data for query type ' + type + ': ' + str(err)) siglaStr = "'" + "','".join(siglaSet) + "'" return f"SELECT Sigla, [Anno iniziale], [Anno finale], [Data codificata], [Titolo Abbreviato], [Autore], [Titolo], [Curatore], [Data descrittiva], [Area generica], [Area specifica], [Genere], [Forma], [Tipo], IQ FROM datibib WHERE Sigla IN ({siglaStr})" ################# elif type=='rif': try: row = queryData['row'] numorg = row['numorg'] ntx = row['ntx'] except: return None return f"SELECT head AS Rif_organico, full AS Rif_completo FROM org WHERE (indice='{numorg}' AND ntx='{ntx}')" ################# elif type=='rifAlt': return rifAlt ################# elif type=='highlight': try: row = queryData['row'] col = queryData['col'] except: return None return f"SELECT spec as highlight FROM form WHERE cod={row[col]}" ################# elif type =='singlecontext': try: subtype = queryData['querySubtype'] table = queryData['table'] parole = queryData['parole'] periodi = queryData['periodi'] brani = queryData['brani'] ntxlocal = queryData['ntxlocal'] mappalocal = queryData['mappalocal'] periodlocal = queryData['periodlocal'] numbranolocal = queryData['numbranolocal'] except: return None if subtype == 'parole': return f"SELECT tab.pitxt, tab.elemlen FROM {table} AS tab WHERE tab.ntx = {ntxlocal} AND tab.mappa <= {mappalocal+int(parole/2)} AND tab.mappa >= {mappalocal-int(parole/2)}" elif subtype == 'periodi': return f"SELECT piniz, pfin FROM periodi WHERE ntx = {ntxlocal} AND numperiod <= {periodlocal+int(periodi/2)} AND numperiod >= {periodlocal-int(periodi/2)}" elif subtype == 'brani': return f"SELECT piniz, pfin FROM linkbase WHERE {ntxlocal} = ntx AND tipo = 2 AND id BETWEEN {numbranolocal-int(brani/2)} AND {numbranolocal+int(brani/2)}" ################# elif type =='links': try: subtype = queryData['querySubtype'] ntxlocal = queryData['ntxlocal'] pinizlocal = queryData['pinizlocal'] pfinlocal = queryData['pfinlocal'] pitxtlocal = queryData['pitxtlocal'] except: return None if subtype == 'nota': return f"SELECT ta.ntx, ta.id, ta.piniz, ta.pfin, tb.mappain, tb.mappafin FROM linkbase AS tb INNER JOIN linknoteass AS ta ON tb.ntx = ta.ntx AND tb.id = ta.id WHERE (((tb.tipo= 1) AND (tb.ntx = {ntxlocal})) AND ((tb.piniz BETWEEN {pinizlocal} AND {pfinlocal}) OR ({pitxtlocal} BETWEEN tb.piniz AND tb.pfin)))" if subtype == 'testo_associato': return f"SELECT ta.ntx, ta.id, ta.piniz, ta.pfin, tb.mappain, tb.mappafin FROM linkbase AS tb INNER JOIN linknoteass AS ta ON tb.ntx = ta.ntx AND tb.id = ta.id WHERE (((tb.tipo= 2) AND (tb.ntx = {ntxlocal})) AND ((tb.piniz BETWEEN {pinizlocal} AND {pfinlocal}) OR ({pitxtlocal} BETWEEN tb.piniz AND tb.pfin)))" ##### else: raise ValueError('Unrecognized query type: ' + type) def rifAlt(connection, queryData): try: coordsSet = queryData['coordsSet'] except KeyError as err: raise KeyError('Missing required data for query type ' + type + ': ' + str(err)) subQueries = [] for coords in coordsSet: try: numorg = coords[0] ntx = coords[1] except IndexError as err: raise KeyError('Incomplete required data for query type ' + type + ': ' + str(err)) subQueries.append( f"SELECT indice AS numorg, ntx, head AS Rif_organico, full AS Rif_completo FROM org WHERE (indice='{numorg}' AND ntx='{ntx}')" ) pandas = queryData.get('pandas') resultList = [] for query in subQueries: if pandas: resultList.append( pd.read_sql(query, connection) ) else: connection.row_factory = dict_factory queryReponse = connection.cursor().execute(query) resultList.append( queryReponse.fetchall() ) if pandas: results = pd.concat(resultList) else: results = list(itertools.chain.from_iterable(resultList)) return results def complexQueryTexts(connection, queryData): try: codList = queryData['codList'] table = queryData['table'] subtype = queryData['querySubtype'] formCodList = queryData.get('formCodList') # KeyError-safe (None if absent) except KeyError as err: raise KeyError('Missing required data for query type ' + type + ': ' + str(err)) strCodList = ",".join(str(c) for c in codList) # Main query, verified to be fast! mainQueryString = f"SELECT tab.cod, tab.ntx, tab.pitxt, tab.elemlen, tab.mappa, tab.numperiod, tab.links, tab.numorg, intbib.sigla, tab.vol, tab.pag, tab.riga, tab.col, tab.tipostanza, tab.stanza, tab.verso, tab.numbrano, lem.spec AS lemma, lem.cat AS cat_gr, lem.omo AS disambiguatore FROM {table} AS tab INNER JOIN intbib ON tab.ntx = intbib.ntx INNER JOIN lem ON tab.indlem = lem.cod" if subtype==0: condition = f"WHERE tab.cod IN ({strCodList})" elif subtype==1: condition = f"WHERE tab.indlem IN ({strCodList})" elif subtype==2: if formCodList is None: return None strFormCodList = ",".join(str(c) for c in formCodList) condition = f" WHERE tab.indlem IN ({strCodList}) OR (tab.indlem = 0 AND tab.cod IN ({strFormCodList}))" mainQueryString = f'{mainQueryString} {condition}' # This value can be changed to change multiple contexts width. Default value for Gatto is parole=31 # parole = 31 # C'è la possibilità di scegliere periodi invece che parole, ma per il momento è disabilitata createTempTable = f'CREATE TEMPORARY TABLE stuff AS {mainQueryString}' mainQuery = f'SELECT * from stuff' addQuery1 = f'SELECT stuff.ntx, stuff.mappa, tab.pitxt AS piniz FROM stuff LEFT JOIN Occ00001 AS tab ON tab.ntx=stuff.ntx AND tab.mappa=stuff.mappa-{int(parole/2)}' addQuery2 = f'SELECT stuff.ntx, stuff.mappa, tab.pitxt AS pfin FROM stuff LEFT JOIN Occ00001 AS tab ON tab.ntx=stuff.ntx AND tab.mappa=stuff.mappa+{int(parole/2)}' addQuery3 = f'SELECT stuff.ntx, stuff.numperiod, periodi.piniz AS backup_piniz, periodi.pfin AS backup_pfin FROM stuff, periodi WHERE stuff.ntx = periodi.ntx AND stuff.numperiod = periodi.numperiod' # Start communication with DB connection.cursor().execute(createTempTable) results = pd.read_sql(mainQuery, connection) results_add1 = pd.read_sql(addQuery1, connection) results_add2 = pd.read_sql(addQuery2, connection) results_add3 = pd.read_sql(addQuery3, connection) results['piniz'] = results_add1['piniz'] results['pfin'] = results_add2['pfin'] results[['backup_piniz', 'backup_pfin']] = results_add3[['backup_piniz', 'backup_pfin']] return results def complexQueryCooccurrences(connection, queryData): try: # the get method for dicts is KeyError-safe (returns None if key is absent) occurrences = queryData['occurrences'] table = queryData['table'] intervallo = queryData['intervallo'] periodo = queryData.get('periodo') # Unused for the moment ordinate = queryData.get('ordinate') # Unused for the moment if periodo is None: periodo = 0 if ordinate is None: ordinate = 0 except KeyError as err: raise KeyError('Missing required data for query: ' + str(err)) # Main part of main query -- verified to be fast! preMainQueryString = f"SELECT tab.cod, tab.ntx, tab.pitxt, tab.elemlen, tab.mappa, tab.numperiod, tab.links, tab.numorg, intbib.sigla, tab.vol, tab.pag, tab.riga, tab.col, tab.tipostanza, tab.stanza, tab.verso, tab.numbrano, lem.spec AS lemma, lem.cat AS cat_gr, lem.omo AS disambiguatore FROM {table} AS tab INNER JOIN intbib ON tab.ntx = intbib.ntx INNER JOIN lem ON tab.indlem = lem.cod" # Main loop on the different occurrences searched by user pitxtList = ['pitxt'] elemlenList = ['elemlen'] for index, occ in enumerate(occurrences): try: subtype = occ['querySubtype'] codList = occ['codList'] formCodList = occ.get('formCodList') except KeyError as err: raise KeyError('Missing required data for query: ' + str(err)) strCodList = ",".join(str(c) for c in codList) if subtype==0: condition = f" WHERE tab.cod IN ({strCodList})" elif subtype==1: condition = f" WHERE tab.indlem IN ({strCodList})" elif subtype==2: if formCodList is None: return None strFormCodList = ",".join(str(c) for c in formCodList) condition = f" WHERE tab.indlem IN ({strCodList}) OR (tab.indlem = 0 AND tab.cod IN ({strFormCodList}))" mainQueryString = f'{preMainQueryString} {condition}' # First occurrence: if index==0: # Create a temporary table for results resTable = 'tempOcc_' + str(index) connection.cursor().execute(f'CREATE TEMPORARY TABLE {resTable} AS {mainQueryString}') connection.cursor().execute(f'CREATE INDEX aa_{index} ON {resTable} (ntx, mappa)') continue else: # update results connection.cursor().execute(f'CREATE TEMPORARY TABLE tempOccB AS {mainQueryString}') connection.cursor().execute(f'CREATE INDEX bb ON tempOccB (ntx, mappa)') oldTable = resTable resTable = 'tempOcc_' + str(index) connection.cursor().execute(f'CREATE TEMPORARY TABLE {resTable} AS SELECT tabA.cod, tabA.ntx, tabA.{" tabA.".join(pitxtList)}, tabA.{" tabA.".join(elemlenList)}, tabA.mappa, tabA.numperiod, tabA.links, tabA.numorg, tabA.sigla, tabA.vol, tabA.pag, tabA.riga, tabA.col, tabA.tipostanza, tabA.stanza, tabA.verso, tabA.numbrano, tabA.lemma, tabA.cat_gr, tabA.disambiguatore, tabB.ntx AS ntx2, tabB.mappa AS mappa2, tabB.pitxt as pitxt_{index}, tabB.elemlen as elemlen_{index} FROM {oldTable} AS tabA, tempOccB AS tabB WHERE tabA.ntx=tabB.ntx AND tabA.mappa BETWEEN tabB.mappa-{intervallo} AND tabB.mappa+{intervallo} AND tabA.mappa != tabB.mappa') connection.cursor().execute(f'CREATE INDEX aa_{index} ON {resTable} (ntx, mappa)') connection.cursor().execute(f'DROP TABLE {oldTable}') pitxtList.append(f'pitxt_{index}') elemlenList.append(f'elemlen_{index}') results = pd.read_sql(f'SELECT * FROM {resTable}', connection) # This value can be changed to change multiple contexts width. Default value for Gatto is parole=31 parole = 31 # C'è la possibilità di scegliere periodi invece che parole, ma per il momento è disabilitata queryPiniz = f'SELECT stuff.ntx, stuff.mappa, tab.pitxt AS piniz FROM {resTable} AS stuff LEFT JOIN {table} AS tab ON tab.ntx=stuff.ntx AND tab.mappa=stuff.mappa-{int(parole/2)}' queryPfin = f'SELECT stuff.ntx, stuff.mappa, tab.pitxt AS pfin FROM {resTable} AS stuff LEFT JOIN {table} AS tab ON tab.ntx=stuff.ntx AND tab.mappa=stuff.mappa+{int(parole/2)}' queryPeriodi = f'SELECT stuff.ntx, stuff.numperiod, periodi.piniz AS backup_piniz, periodi.pfin AS backup_pfin FROM {resTable} AS stuff, periodi WHERE stuff.ntx = periodi.ntx AND stuff.numperiod = periodi.numperiod' resultsPiniz = pd.read_sql(queryPiniz, connection) resultsPfin = pd.read_sql(queryPfin, connection) resultsPeriodi = pd.read_sql(queryPeriodi, connection) results['piniz'] = resultsPiniz['piniz'] results['pfin'] = resultsPfin['pfin'] results[['backup_piniz', 'backup_pfin']] = resultsPeriodi[['backup_piniz', 'backup_pfin']] return results # Dict factory non-Pandas queries def dict_factory(cursor, row): fields = [column[0] for column in cursor.description] return {key: value for key, value in zip(fields, row)}