Browse Source

Un miracolo!

kora 1 year ago
parent
commit
a8ef9eaf46

+ 30 - 31
flask_be/engine/cooccorrenze.py

@@ -13,40 +13,39 @@ class cooccorrenze(basicQueries):
         super().__init__(dataConfig)
 
     #%% funzione ricerca per cooccorrenze. 
-    # Ha in input un array del tipo [forma/lemma_cercati, tipo_ricerca, ricerca_espansa, iniziale_raddoppiata].
-    # l'attributo tipo_ricerca definisce il tipo di ricerca in input (0 per forme, 1 per lemmi, 2 per lemmi con opzione "mostra occorrenze non lemmatizzate").
+    # Ha in input un array di arrays del tipo:
+    # [forma/lemma_cercati, tipo_ricerca, ricerca_espansa, iniziale_raddoppiata].
+    # l'attributo tipo_ricerca ha come valori ammessi: 0 per forme, 1 per lemmi, 2 per lemmi + occorrenze non lemmatizzate.
     # Permette di definire l'intervallo di ricerca (in numero di parole), la possibilità di cercare soltanto all'interno dello stesso periodo (0/1) e/o di cercare le occorrenze in modo ordinato (0/1)
     def ricerca_cooccorrenze(self, listaricerche, intervallo, periodo, ordinate):
-        listatesti = pd.DataFrame()
-        cod = 1
-        if listaricerche[0][1] == 0:
-            ricerca = self.sendBasicQuery(listaricerche[0][0], 'forma', listaricerche[0][2], listaricerche[0][3], pandas=True)
-            listatesti = self.findtexts(0, ricerca)
-        elif listaricerche[0][1] == 1:
-            ricerca = self.sendBasicQuery(listaricerche[0][0], 'lemma', listaricerche[0][2], listaricerche[0][3], pandas=True)
-            listatesti = self.findtexts(1, ricerca)
-        elif listaricerche[0][1] == 2:
-            ricerca = self.sendBasicQuery(listaricerche[0][0], 'lemma', listaricerche[0][2], listaricerche[0][3], pandas=True)
-            listatesti = self.findtexts(2, ricerca)
-
-        if listatesti.empty:
+        occurrences = [] # una lista di Dicts con i codici dei lemmi/forme da cercare
+        for ricerca, tipo, espansa, raddoppiata in listaricerche:
+            if tipo==0:
+                res1 = self.sendBasicQuery(ricerca, 'forma', espansa, raddoppiata, pandas=True)
+                if res1.empty:
+                    return []
+                occurrences.append({'codList': list(res1['cod']), 'querySubtype': 0})
+            elif tipo==1:
+                res1 = self.sendBasicQuery(ricerca, 'lemma', espansa, raddoppiata, pandas=True)
+                if res1.empty:
+                    return []
+                occurrences.append({'codList': list(res1['cod']), 'querySubtype': 1})
+            elif tipo==2:
+                res1 = self.sendBasicQuery(ricerca, 'lemma', espansa, raddoppiata, pandas=True)
+                if res1.empty:
+                    return []
+                codList = list(res1['cod'])
+                subQueryData = {'queryType': 'pfl', 'codList': codList}
+                subdf = self.queryHandler.query(subQueryData, pandas=True)
+                formCodList = list(subdf['codForma'])
+                occurrences.append({'codList': codList, 'formCodList': formCodList, 'querySubtype': 2})
+
+        if len(occurrences)==0:
             return []
 
-        for ricerca, tipo, espansa, raddoppiata in listaricerche[1:]:
-            if tipo == 0:
-                search = self.sendBasicQuery(ricerca, 'forma', espansa, raddoppiata, pandas=True)
-            elif tipo == 1:
-                search = self.sendBasicQuery(ricerca, 'lemma', espansa, raddoppiata, pandas=True)
-            elif tipo == 2:
-                search = self.sendBasicQuery(ricerca, 'lemma', espansa, raddoppiata, pandas=True)
-
-            textlist = self.findtexts(tipo, search)
-
-            listatesti = listatesti.merge(textlist, on='ntx', suffixes=('', f'_{cod}'))
-            cond1 = listatesti['numperiod'] == listatesti[f'numperiod_{cod}'] if periodo == 1 else True
-            cond2 = ((listatesti['mappa'] - listatesti[f'mappa_{cod}']) != 0) & ((listatesti['mappa'] - listatesti[f'mappa_{cod}']).abs() <= intervallo) if ordinate == 0 else ((listatesti[f'mappa_{cod}'] - listatesti['mappa']) > 0) & ((listatesti[f'mappa_{cod}'] - listatesti['mappa']).abs() <= intervallo)
-            cod += 1
-            listatesti = listatesti[cond1 & cond2]
+        queryData = {'queryType': 'co-occurrences', 'occurrences': occurrences, 'intervallo': intervallo, 'periodo': periodo, 'ordinate': ordinate}
+        queryResponses = [self.queryHandler.query(dict(queryData, table=table), pandas=True) for table in self.listOcc]
+        listatesti = pd.concat(queryResponses)
 
         if listatesti.empty:
             return []
@@ -56,4 +55,4 @@ class cooccorrenze(basicQueries):
             clean = bibliocontexts.drop_duplicates(subset="contesto")
             highlights = formatAllContexts(clean)
 
-        return highlights.to_dict(orient='records')
+        return highlights.to_dict(orient='records')

+ 114 - 30
flask_be/interface_sqlite3/actual_queries.py

@@ -79,6 +79,10 @@ def prepareQuery(queryData):
     elif type=='texts':
         return complexQueryTexts
     
+    ###################
+    elif type=='co-occurrences':
+        return complexQueryCooccurrences
+
     ######################
     elif type=='bib':
         try:
@@ -154,42 +158,36 @@ def complexQueryTexts(connection, queryData):
         formCodList = queryData.get('formCodList') # KeyError-safe (None if absent)
     except KeyError as err:
         raise KeyError('Missing required data for query type ' + type + ': ' + str(err))
-    # These values can be changed to changa multiple contexts widht. Default value for Gatto is parole=31 #
+    
+    strCodList = ",".join(str(c) for c in codList)
+
+    # Main query, verified to be fast!
+    mainQueryString = f"SELECT tab.cod, tab.ntx, tab.pitxt, tab.elemlen, tab.mappa, tab.numperiod, tab.links, tab.numorg, intbib.sigla, tab.vol, tab.pag, tab.riga, tab.col, tab.tipostanza, tab.stanza, tab.verso, tab.numbrano, lem.spec AS lemma, lem.cat AS cat_gr, lem.omo AS disambiguatore FROM {table} AS tab INNER JOIN intbib ON tab.ntx = intbib.ntx INNER JOIN lem ON tab.indlem = lem.cod"
+    if subtype==0:
+        condition = f"WHERE tab.cod IN ({strCodList})"
+    elif subtype==1:
+        condition = f"WHERE tab.indlem IN ({strCodList})"
+    elif subtype==2:
+        if formCodList is None:
+            return None
+        strFormCodList = ",".join(str(c) for c in formCodList)
+        condition = f" WHERE tab.indlem IN ({strCodList}) OR (tab.indlem = 0 AND tab.cod IN ({strFormCodList}))"
+
+    mainQueryString = f'{mainQueryString} {condition}'
+
+    # This value can be changed to change multiple contexts width. Default value for Gatto is parole=31 #
     parole = 31
-    periodi = 0
-    #                                                                                                     #
-    strlist = ",".join(str(c) for c in codList)
+    # C'è la possibilità di scegliere periodi invece che parole, ma per il momento è disabilitata
+    createTempTable = f'CREATE TEMPORARY TABLE stuff AS {mainQueryString}' 
+    mainQuery = f'SELECT * from stuff'
+    addQuery1 = f'SELECT stuff.ntx, stuff.mappa, tab.pitxt AS piniz FROM stuff LEFT JOIN Occ00001 AS tab ON tab.ntx=stuff.ntx AND tab.mappa=stuff.mappa-{int(parole/2)}'
+    addQuery2 = f'SELECT stuff.ntx, stuff.mappa, tab.pitxt AS pfin FROM stuff LEFT JOIN Occ00001 AS tab ON tab.ntx=stuff.ntx AND tab.mappa=stuff.mappa+{int(parole/2)}'
+    addQuery3 = f'SELECT stuff.ntx, stuff.numperiod, periodi.piniz AS backup_piniz, periodi.pfin AS backup_pfin FROM stuff, periodi WHERE stuff.ntx = periodi.ntx AND stuff.numperiod = periodi.numperiod'
 
-    mainQueryString = ""
-    if parole != 0:
-        if subtype==0:
-            mainQueryString = f"SELECT tab.cod, tab.ntx, tab.pitxt, tab.elemlen, tab.mappa, tab.numperiod, tab.links, tab.numorg, intbib.sigla, tab.vol, tab.pag, tab.riga, tab.col, tab.tipostanza, tab.stanza, tab.verso, tab.numbrano, lem.spec AS lemma, lem.cat AS cat_gr, lem.omo AS disambiguatore FROM {table} AS tab INNER JOIN intbib ON tab.ntx = intbib.ntx INNER JOIN lem ON tab.indlem = lem.cod WHERE tab.cod IN ({strlist})"
-        elif subtype==1:
-            mainQueryString = f"SELECT tab.cod, tab.ntx, tab.pitxt, tab.elemlen, tab.mappa, tab.numperiod, tab.links, tab.numorg, intbib.sigla, tab.vol, tab.pag, tab.riga, tab.col, tab.tipostanza, tab.stanza, tab.verso, tab.numbrano, lem.spec AS lemma, lem.cat AS cat_gr, lem.omo AS disambiguatore FROM {table} AS tab INNER JOIN intbib ON tab.ntx = intbib.ntx INNER JOIN lem ON tab.indlem = lem.cod WHERE tab.indlem IN ({strlist})"
-        elif subtype==2:
-            if formCodList is None:
-                return None
-            strform = ",".join(str(c) for c in formCodList)
-            mainQueryString = f"SELECT tab.cod, tab.ntx, tab.pitxt, tab.elemlen, tab.mappa, tab.numperiod, tab.links, tab.numorg, intbib.sigla, tab.vol, tab.pag, tab.riga, tab.col, tab.tipostanza, tab.stanza, tab.verso, tab.numbrano, lem.spec AS lemma, lem.cat AS cat_gr, lem.omo AS disambiguatore FROM {table} AS tab INNER JOIN intbib ON tab.ntx = intbib.ntx INNER JOIN lem ON tab.indlem = lem.cod WHERE tab.indlem IN ({strlist}) OR (tab.indlem = 0 AND tab.cod IN ({strform}))"
-    else:
-        if subtype==0:
-            mainQueryString = f"SELECT tab.cod, tab.ntx, tab.pitxt, tab.elemlen, tab.mappa, tab.numperiod, tab.links, tab.numorg, intbib.sigla, tab.vol, tab.pag, tab.riga, tab.col, tab.tipostanza, tab.stanza, tab.verso, tab.numbrano, lem.spec AS lemma, lem.cat AS cat_gr, lem.omo AS disambiguatore FROM {table} AS tab INNER JOIN intbib ON tab.ntx = intbib.ntx INNER JOIN lem ON tab.indlem = lem.cod WHERE tab.cod IN ({strlist})"
-        elif subtype==1:
-            mainQueryString = f"SELECT tab.cod, tab.ntx, tab.pitxt, tab.elemlen, tab.mappa, tab.numperiod, tab.links, tab.numorg, intbib.sigla, tab.vol, tab.pag, tab.riga, tab.col, tab.tipostanza, tab.stanza, tab.verso, tab.numbrano, lem.spec AS lemma, lem.cat AS cat_gr, lem.omo AS disambiguatore FROM {table} AS tab INNER JOIN intbib ON tab.ntx = intbib.ntx INNER JOIN lem ON tab.indlem = lem.cod WHERE tab.indlem IN ({strlist})"
-        elif subtype==2:
-            if formCodList is None:
-                return None
-            strform = ",".join(str(c) for c in formCodList)
-            mainQueryString = f"SELECT tab.cod, tab.ntx, tab.pitxt, tab.elemlen, tab.mappa, tab.numperiod, tab.links, tab.numorg, intbib.sigla, tab.vol, tab.pag, tab.riga, tab.col, tab.tipostanza, tab.stanza, tab.verso, tab.numbrano, lem.spec AS lemma, lem.cat AS cat_gr, lem.omo AS disambiguatore FROM {table} AS tab INNER JOIN intbib ON tab.ntx = intbib.ntx INNER JOIN lem ON tab.indlem = lem.cod WHERE tab.indlem IN ({strlist}) OR (tab.indlem = 0 AND tab.cod IN ({strform}))"
 
     # Start communication with DB
-    createTempTable = f'CREATE TEMPORARY TABLE stuff AS {mainQueryString}' 
     connection.cursor().execute(createTempTable)
 
-    mainQuery = f'SELECT * from stuff'
-    addQuery1 = f'SELECT stuff.ntx, stuff.mappa, tab.pitxt AS piniz FROM stuff LEFT JOIN Occ00001 AS tab ON tab.ntx=stuff.ntx AND tab.mappa=stuff.mappa-15'
-    addQuery2 = f'SELECT stuff.ntx, stuff.mappa, tab.pitxt AS pfin FROM stuff LEFT JOIN Occ00001 AS tab ON tab.ntx=stuff.ntx AND tab.mappa=stuff.mappa+15'
-    addQuery3 = f'SELECT stuff.ntx, stuff.numperiod, periodi.piniz AS backup_piniz, periodi.pfin AS backup_pfin FROM stuff, periodi WHERE stuff.ntx = periodi.ntx AND stuff.numperiod = periodi.numperiod'
     results = pd.read_sql(mainQuery, connection)
     results_add1 = pd.read_sql(addQuery1, connection)
     results_add2 = pd.read_sql(addQuery2, connection)
@@ -199,3 +197,89 @@ def complexQueryTexts(connection, queryData):
     results[['backup_piniz', 'backup_pfin']] = results_add3[['backup_piniz', 'backup_pfin']]
 
     return results
+
+
+def _cooccurrenceCondition(occ):
+    """Build the SQL WHERE clause selecting the rows of one searched term.
+
+    `occ` is a dict with the keys 'querySubtype' (0, 1 or 2) and 'codList',
+    plus 'formCodList' when the subtype is 2. The subtype decides which
+    columns of the occurrence table (aliased `tab`) the codes are matched on:
+    0 -> tab.cod; 1 -> tab.indlem; 2 -> tab.indlem, or tab.cod for the rows
+    whose indlem is 0.
+
+    Returns the clause as a string, or None when the subtype is 2 but
+    'formCodList' is missing. Raises KeyError if a required key is absent
+    and ValueError if the subtype is not one of 0, 1, 2.
+    """
+    try:
+        subtype = occ['querySubtype']
+        codList = occ['codList']
+    except KeyError as err:
+        raise KeyError('Missing required data for query: ' + str(err)) from err
+    # dict.get is KeyError-safe (returns None if the key is absent)
+    formCodList = occ.get('formCodList')
+
+    strCodList = ",".join(str(c) for c in codList)
+
+    if subtype == 0:
+        return f" WHERE tab.cod IN ({strCodList})"
+    if subtype == 1:
+        return f" WHERE tab.indlem IN ({strCodList})"
+    if subtype == 2:
+        if formCodList is None:
+            return None
+        strFormCodList = ",".join(str(c) for c in formCodList)
+        return f" WHERE tab.indlem IN ({strCodList}) OR (tab.indlem = 0 AND tab.cod IN ({strFormCodList}))"
+
+    # An unknown subtype used to leave the condition unbound (first term) or
+    # silently reuse the previous term's condition (later terms).
+    raise ValueError('Unknown querySubtype for co-occurrence query: ' + repr(subtype))
+
+
+def complexQueryCooccurrences(connection, queryData):
+    """Find the occurrences of the first searched term that have every other
+    searched term within `intervallo` positions in the same text.
+
+    Required keys of `queryData`:
+      - 'occurrences': list of dicts, one per searched term
+        (see _cooccurrenceCondition for their shape);
+      - 'table': name of the occurrence table to search;
+      - 'intervallo': maximum distance, measured on the `mappa` column.
+    The optional keys 'periodo' and 'ordinate' are accepted but NOT applied
+    yet: the join below filters on text and distance only.
+
+    Returns a DataFrame with one row per match: the columns of the first
+    term's occurrence, `pitxt_<i>`/`elemlen_<i>` for every further term i,
+    `ntx2`/`mappa2` for the last term, plus the context columns `piniz`,
+    `pfin`, `backup_piniz` and `backup_pfin`. Returns None when a term of
+    subtype 2 lacks 'formCodList', and an empty DataFrame when 'occurrences'
+    is empty.
+
+    Raises KeyError if a required key is missing and ValueError if a term has
+    an unknown subtype. Every temporary table created here is dropped before
+    returning, so the function can be called repeatedly on one connection.
+    """
+    try:
+        occurrences = queryData['occurrences']
+        table = queryData['table']
+        # Coerced to int because the value is interpolated into the SQL text.
+        intervallo = int(queryData['intervallo'])
+    except KeyError as err:
+        raise KeyError('Missing required data for query: ' + str(err)) from err
+
+    # Validate every term before touching the DB, so that an early return
+    # cannot leave temporary tables behind.
+    conditions = []
+    for occ in occurrences:
+        condition = _cooccurrenceCondition(occ)
+        if condition is None:
+            return None
+        conditions.append(condition)
+    if not conditions:
+        return pd.DataFrame()
+
+    # Main part of main query -- verified to be fast!
+    preMainQueryString = f"SELECT tab.cod, tab.ntx, tab.pitxt, tab.elemlen, tab.mappa, tab.numperiod, tab.links, tab.numorg, intbib.sigla, tab.vol, tab.pag, tab.riga, tab.col, tab.tipostanza, tab.stanza, tab.verso, tab.numbrano, lem.spec AS lemma, lem.cat AS cat_gr, lem.omo AS disambiguatore FROM {table} AS tab INNER JOIN intbib ON tab.ntx = intbib.ntx INNER JOIN lem ON tab.indlem = lem.cod"
+
+    cursor = connection.cursor()
+    tempTables = []  # temporary tables currently alive, dropped in `finally`
+    pitxtList = ['pitxt']
+    elemlenList = ['elemlen']
+    try:
+        # Main loop on the different occurrences searched by user
+        for index, condition in enumerate(conditions):
+            mainQueryString = f'{preMainQueryString} {condition}'
+
+            # First occurrence: it seeds the temporary table of results
+            if index == 0:
+                resTable = 'tempOcc_' + str(index)
+                cursor.execute(f'CREATE TEMPORARY TABLE {resTable} AS {mainQueryString}')
+                tempTables.append(resTable)
+                cursor.execute(f'CREATE INDEX aa_{index} ON {resTable} (ntx, mappa)')
+                continue
+
+            # Further occurrences: narrow the results by joining on this term
+            cursor.execute(f'CREATE TEMPORARY TABLE tempOccB AS {mainQueryString}')
+            tempTables.append('tempOccB')
+            cursor.execute('CREATE INDEX bb ON tempOccB (ntx, mappa)')
+
+            oldTable = resTable
+            resTable = 'tempOcc_' + str(index)
+            # Columns carried over from the previous result table. They are
+            # joined with ", " -- the missing comma used to produce invalid
+            # SQL from the third searched term on.
+            carried = ['cod', 'ntx', *pitxtList, *elemlenList, 'mappa', 'numperiod', 'links', 'numorg', 'sigla', 'vol', 'pag', 'riga', 'col', 'tipostanza', 'stanza', 'verso', 'numbrano', 'lemma', 'cat_gr', 'disambiguatore']
+            selectA = ", ".join(f'tabA.{name}' for name in carried)
+            cursor.execute(f'CREATE TEMPORARY TABLE {resTable} AS SELECT {selectA}, tabB.ntx AS ntx2, tabB.mappa AS mappa2, tabB.pitxt as pitxt_{index}, tabB.elemlen as elemlen_{index} FROM {oldTable} AS tabA, tempOccB AS tabB WHERE tabA.ntx=tabB.ntx AND tabA.mappa BETWEEN tabB.mappa-{intervallo} AND tabB.mappa+{intervallo} AND tabA.mappa != tabB.mappa')
+            tempTables.append(resTable)
+            cursor.execute(f'CREATE INDEX aa_{index} ON {resTable} (ntx, mappa)')
+
+            # Both inputs of the join are now obsolete. tempOccB in particular
+            # must go, or the next iteration cannot re-create it.
+            for used in (oldTable, 'tempOccB'):
+                cursor.execute(f'DROP TABLE {used}')
+                tempTables.remove(used)
+            pitxtList.append(f'pitxt_{index}')
+            elemlenList.append(f'elemlen_{index}')
+
+        results = pd.read_sql(f'SELECT * FROM {resTable}', connection)
+
+        # This value can be changed to change multiple contexts width. Default value for Gatto is parole=31
+        parole = 31
+        # Sizing the context in sentences ("periodi") instead of words is possible, but disabled for the moment
+        queryPiniz = f'SELECT stuff.ntx, stuff.mappa, tab.pitxt AS piniz FROM {resTable} AS stuff LEFT JOIN {table} AS tab ON tab.ntx=stuff.ntx AND tab.mappa=stuff.mappa-{int(parole/2)}'
+        queryPfin = f'SELECT stuff.ntx, stuff.mappa, tab.pitxt AS pfin FROM {resTable} AS stuff LEFT JOIN {table} AS tab ON tab.ntx=stuff.ntx AND tab.mappa=stuff.mappa+{int(parole/2)}'
+        queryPeriodi = f'SELECT stuff.ntx, stuff.numperiod, periodi.piniz AS backup_piniz, periodi.pfin AS backup_pfin FROM {resTable} AS stuff, periodi WHERE stuff.ntx = periodi.ntx AND stuff.numperiod = periodi.numperiod'
+
+        resultsPiniz = pd.read_sql(queryPiniz, connection)
+        resultsPfin = pd.read_sql(queryPfin, connection)
+        resultsPeriodi = pd.read_sql(queryPeriodi, connection)
+        # NOTE(review): the three frames are aligned with `results` by row
+        # position only; this assumes each query returns exactly one row per
+        # result row, in the same order -- TODO confirm.
+        results['piniz'] = resultsPiniz['piniz']
+        results['pfin'] = resultsPfin['pfin']
+        results[['backup_piniz', 'backup_pfin']] = resultsPeriodi[['backup_piniz', 'backup_pfin']]
+
+        return results
+    finally:
+        # Leave the connection clean, on success as well as on failure.
+        for name in tempTables:
+            cursor.execute(f'DROP TABLE IF EXISTS {name}')

+ 16 - 7
test_suite/tests_kora_misc/Query_speed/queries_2.py

@@ -98,8 +98,7 @@ timestamp2 = time.time()
 
 print(timestamp2-timestamp0)
 # %%
-res
-# %%
+# Prova una cooccorrenza
 str1 = 'prima'
 strEnc1 = db_encode(vettSpec, str1)
 
@@ -114,18 +113,28 @@ formCodes1 = [res['cod'] for res in formsNoPandas1]
 formCodesStr1 = [str(code) for code in formCodes1]
 # %%
 timestamp0 = time.time()
+ind = 0
 with sqlite3.connect(f"file:{dbFile}?mode=ro", uri=True) as connection:
     mainQueryString = theMainQuery(','.join(lemCodesStr0), ','.join(formCodesStr0))
 
     otherQueryString = theMainQuery0(','.join(formCodesStr1))
 
-    connection.cursor().execute(f'CREATE TEMPORARY TABLE tempOccA AS {mainQueryString}')
-    connection.cursor().execute(f'CREATE INDEX aa ON tempOccA (ntx, mappa)')
+    resTable = 'tempOcc_' + str(ind)
+    connection.cursor().execute(f'CREATE TEMPORARY TABLE {resTable} AS {mainQueryString}')
+    connection.cursor().execute(f'CREATE INDEX aa_{ind} ON {resTable} (ntx, mappa)')
+
+    connection.cursor().execute(f'CREATE TEMPORARY TABLE tempOcc AS {otherQueryString}')
+    connection.cursor().execute(f'CREATE INDEX bb ON tempOcc (ntx, mappa)')
+
+    ind += 1
+    oldTable = resTable
+    resTable = 'tempOcc_' + str(ind)
+    connection.cursor().execute(f'CREATE TEMPORARY TABLE {resTable} AS SELECT tabA.ntx as ntx, tabA.mappa as mappa, tabB.ntx as ntx2, tabB.mappa as mappa2 FROM {oldTable} AS tabA, tempOcc AS tabB WHERE tabA.ntx=tabB.ntx AND tabA.mappa BETWEEN tabB.mappa-10 AND tabB.mappa+10 AND tabA.mappa != tabB.mappa')
+    connection.cursor().execute(f'CREATE INDEX aa_{ind} ON {resTable} (ntx, mappa)')
+    connection.cursor().execute(f'DROP TABLE {oldTable}')
 
-    connection.cursor().execute(f'CREATE TEMPORARY TABLE tempOccB AS {otherQueryString}')
-    connection.cursor().execute(f'CREATE INDEX bb ON tempOccB (ntx, mappa)')
+    res = pd.read_sql(f'SELECT * FROM {resTable}', connection)
 
-    res = pd.read_sql('SELECT tabA.ntx, tabA.mappa, tabB.ntx, tabB.mappa FROM tempOccA AS tabA, tempOccB AS tabB WHERE tabA.ntx=tabB.ntx AND tabA.mappa BETWEEN tabB.mappa-10 AND tabB.mappa+10 AND tabA.mappa != tabB.mappa', connection)
 
 timestamp1 = time.time()
 print(timestamp1 - timestamp0)