Browse Source

subcorpus fully working on test

Leonardo Canova 1 year ago
parent
commit
833b471c2b
2 changed files with 82 additions and 92 deletions
  1. 80 90
      test_suite/test/simple_query_test_pandas.py
  2. 2 2
      test_suite/test/test_sottocorpora.py

+ 80 - 90
test_suite/test/simple_query_test_pandas.py

@@ -11,7 +11,47 @@ import pandas as pd
 import dtale
 import unicodedata
 import sys
-
+#from test_occorrenzario_pandas import findtexts, get_tables_occ
+from test_sottocorpora import definiscisottocorpus
+
+def get_tables_occ(path):
+    """Return the names of the occurrence tables stored in ``test1.db``.
+
+    Args:
+        path: directory that contains ``test1.db``; the database is opened
+            read-only.
+
+    Returns:
+        List of the table names listed in ``sqlite_master`` that start
+        with ``'Occ'``.
+    """
+    conn = sqlite3.connect(f"file:{path}/test1.db?mode=ro", uri=True)
+    try:
+        rows = conn.execute("SELECT name FROM sqlite_master WHERE type='table';").fetchall()
+    finally:
+        # Close the handle even when the query raises, so it is never leaked.
+        conn.close()
+    return [name for (name,) in rows if name.startswith('Occ')]
+
+#%% Takes the output of the search functions and looks up, in the occurrence tables, the pointers to the contexts and the other elements associated with them.
+# The type argument selects the kind of input search (0 for forms, 1 for lemmas or grammatical category, 2 for lemmas with the "show non-lemmatized occurrences" option)
+def findtexts(type, df, listOcc, path, subcorpus=None):
+    """Collect the occurrence rows that match the codes in ``df``.
+
+    Args:
+        type: kind of search that produced ``df``. 0 matches the codes
+            against ``tab.cod``; 1 matches them against ``tab.indlem``;
+            2 matches them against ``tab.indlem`` and also keeps rows with
+            ``indlem = 0`` whose ``cod`` is one of the forms that ``pfl``
+            links to those lemmas. Any other value yields an empty result.
+        df: DataFrame with a ``cod`` column.
+        listOcc: names of the occurrence tables to query.
+        path: directory that contains ``test1.db`` (opened read-only).
+        subcorpus: optional collection of ``sigla`` values; when given, only
+            rows whose ``sigla`` belongs to it are returned.
+
+    Returns:
+        DataFrame with the rows of every queried table, or an empty
+        DataFrame when there was nothing to query.
+    """
+    tables = list(listOcc)
+    if type not in (0, 1, 2) or not tables:
+        # Nothing to query: return early instead of failing later on the
+        # missing 'sigla' column when a subcorpus is requested.
+        return pd.DataFrame()
+
+    columns = "tab.cod, tab.indlem, tab.ntx, tab.pitxt, tab.elemlen, tab.mappa, tab.numperiod, tab.links, tab.numorg, intbib.sigla, tab.vol, tab.pag, tab.riga, tab.col, tab.tipostanza, tab.stanza, tab.verso, tab.numbrano, lem.spec AS lemma, lem.cat AS cat_gr, lem.omo AS disambiguatore"
+    joins = "INNER JOIN intbib ON tab.ntx = intbib.ntx INNER JOIN lem ON tab.indlem = lem.cod"
+    # The code list does not depend on the table, so it is built once.
+    strlist = ",".join(str(c) for c in df["cod"])
+
+    con = sqlite3.connect(f"file:{path}/test1.db?mode=ro", uri=True)
+    try:
+        if type == 0:
+            where = f"tab.cod IN ({strlist})"
+        elif type == 1:
+            where = f"tab.indlem IN ({strlist})"
+        else:
+            # The lemma -> form lookup is the same for every table: run it once.
+            subquery = f"SELECT DISTINCT lemma, forma FROM pfl WHERE lemma IN ({strlist})"
+            subdf = pd.read_sql(subquery, con)
+            strform = ",".join(str(c) for c in subdf["forma"])
+            where = f"tab.indlem IN ({strlist}) OR (tab.indlem = 0 AND tab.cod IN ({strform}))"
+        frames = [
+            pd.read_sql(f"SELECT {columns} FROM {table} AS tab {joins} WHERE {where}", con)
+            for table in tables
+        ]
+    finally:
+        # Always release the read-only handle, even if a query fails.
+        con.close()
+
+    # A single concat avoids re-copying the accumulated rows for every table.
+    textlist = pd.concat(frames)
+    if subcorpus is not None:
+        textlist = textlist[textlist['sigla'].isin(subcorpus)]
+    return textlist
 
 #%% funzione combinazioni <> è chiamata da interpreter
 def combinations(s):
@@ -55,7 +95,7 @@ def counter (results):
 
 
 #%% Funzione ricerca per forme
-def ricercaforme (entries, path, espansa, raddoppiata):
+def ricercaforme (entries, path, espansa, raddoppiata, subcorpus=None):
     if espansa == 0: 
         data=" OR spec LIKE ".join(entries)
         doubleddata=" OR spec LIKE ".join(inizialeraddoppiata(entries))
@@ -65,14 +105,6 @@ def ricercaforme (entries, path, espansa, raddoppiata):
         else:
             theSimpleQuery = f"SELECT spec AS forma, nocc AS occ, cod FROM form WHERE spec LIKE {data} ORDER BY idfor"
 
-        con = sqlite3.connect(f"file:{path}/test1.db?mode=ro", uri=True)
-        answer_table = pd.read_sql(theSimpleQuery, con)
-        if answer_table.empty:
-            print ("Nessun risultato")
-            sys.exit(1)
-        else:
-            return answer_table
-
     else:
 
         data=" OR spec LIKE ".join(entries)
@@ -85,45 +117,25 @@ def ricercaforme (entries, path, espansa, raddoppiata):
         else:
             theSimpleQuery = f"SELECT DISTINCT spec AS forma, nocc AS occ, cod FROM form WHERE (spec LIKE {data}) OR (norm LIKE {data_norm}) ORDER BY idfor"
 
-        con = sqlite3.connect(f"file:{path}/test1.db?mode=ro", uri=True)
-        answer_table = pd.read_sql(theSimpleQuery, con)
-        if answer_table.empty:
-            print ("Nessun risultato")
-            sys.exit(1)
-        else:
-            return answer_table
-    #deprecated
-    """if espansa == 0:
-
-        data=" OR spec LIKE ".join(entries)
-        doubleddata=" OR spec LIKE ".join(inizialeraddoppiata(entries))
-
-        if raddoppiata == 1: 
-            theSimpleQuery = "SELECT spec AS forma, nocc AS occ, cod FROM form WHERE spec LIKE " + data + " OR spec LIKE " + doubleddata + "ORDER BY idfor"
-        else:
-            theSimpleQuery = "SELECT spec AS forma, nocc AS occ, cod FROM form WHERE spec LIKE " + data + " ORDER BY idfor"
-
-        con = sqlite3.connect("file:" + path + "/db/test1.db" + "?mode=ro", uri=True)
-        answer_table = pd.read_sql(theSimpleQuery, con)
-        return answer_table
-
+    con = sqlite3.connect(f"file:{path}/test1.db?mode=ro", uri=True)
+    answer_table = pd.read_sql(theSimpleQuery, con)
+    if answer_table.empty:
+        print ("Nessun risultato")
+        sys.exit(1)
     else:
-
-        data=" OR spec LIKE ".join(entries)
-        data_norm=" OR norm LIKE ".join(list_normalize(entries))
-        doubleddata_norm=" OR norm LIKE ".join(list_normalize(inizialeraddoppiata(entries)))
-        doubleddata=" OR spec LIKE ".join(inizialeraddoppiata(entries))
-
-        if raddoppiata == 1:
-            theSimpleQuery = "SELECT DISTINCT spec AS forma, nocc AS occ, cod FROM form WHERE (spec LIKE " + data +") OR (norm LIKE " + data_norm + ") OR (spec LIKE " + doubleddata + ") OR (norm LIKE " + doubleddata_norm + ")" + " ORDER BY idfor"
+        if subcorpus == None:
+            return answer_table
         else:
-            theSimpleQuery = "SELECT DISTINCT spec AS forma, nocc AS occ, cod FROM form WHERE (spec LIKE " + data +") OR (norm LIKE " + data_norm + ")" + " ORDER BY idfor"
-
-        con = sqlite3.connect("file:" + path + "/db/test1.db" + "?mode=ro", uri=True)
-        answer_table = pd.read_sql(theSimpleQuery, con)
-        return answer_table"""
+            listOcc = get_tables_occ(path)
+            textlist = findtexts(0, answer_table, listOcc, path, subcorpus)
+            counts = textlist.groupby('cod').size().reset_index(name='count')
+            answer_table = pd.merge(answer_table, counts, on='cod', how='left')
+            answer_table['occ'] = answer_table['count'].fillna(answer_table['occ']).astype(int)
+            answer_table = answer_table.dropna(subset=['count'])
+            return answer_table
+        
 #%% Funzione ricerca per lemmi
-def ricercalemmi (entries, path, espansa, raddoppiata):
+def ricercalemmi (entries, path, espansa, raddoppiata, subcorpus=None):
     if espansa == 0:
         data = " OR spec LIKE ".join(entries)
         doubleddata = " OR spec LIKE ".join(inizialeraddoppiata(entries))
@@ -133,15 +145,8 @@ def ricercalemmi (entries, path, espansa, raddoppiata):
         else:
             theSimpleQuery = f"SELECT spec AS lemma, cat AS cat_gr, omo AS disambiguatore, nocc AS occ, cod FROM lem WHERE spec LIKE {data} ORDER BY idlem"
 
-        con = sqlite3.connect(f"file:{path}/test1.db?mode=ro", uri=True)
-        answer_table = pd.read_sql(theSimpleQuery, con)
-        if answer_table.empty:
-            print ("Nessun risultato")
-            sys.exit(1)
-        else:
-            return answer_table
-
     else:
+
         data = " OR spec LIKE ".join(entries)
         data_norm = " OR norm LIKE ".join(list_normalize(entries))
         doubleddata_norm = " OR norm LIKE ".join(list_normalize(inizialeraddoppiata(entries)))
@@ -152,43 +157,23 @@ def ricercalemmi (entries, path, espansa, raddoppiata):
         else:
             theSimpleQuery = f"SELECT DISTINCT spec AS lemma, cat AS cat_gr, omo AS disambiguatore, nocc AS occ, cod FROM lem WHERE (spec LIKE {data}) OR (norm LIKE {data_norm}) ORDER BY idlem"
 
-        con = sqlite3.connect(f"file:{path}/test1.db?mode=ro", uri=True)
-        answer_table = pd.read_sql(theSimpleQuery, con)
-        if answer_table.empty:
-            print ("Nessun risultato")
-            sys.exit(1)
+    con = sqlite3.connect(f"file:{path}/test1.db?mode=ro", uri=True)
+    answer_table = pd.read_sql(theSimpleQuery, con)
+    if answer_table.empty:
+        print ("Nessun risultato")
+        sys.exit(1)
+    else:
+        if subcorpus == None:
+            return answer_table
         else:
+            listOcc = get_tables_occ(path)
+            textlist = findtexts(1, answer_table, listOcc, path, subcorpus)
+            counts = textlist.groupby('indlem').size().reset_index(name='count')
+            answer_table = pd.merge(answer_table, counts, left_on='cod', right_on='indlem')
+            answer_table['occ'] = answer_table['count'].fillna(answer_table['occ']).astype(int)
+            answer_table = answer_table.dropna(subset=['count'])
             return answer_table
-    #deprecated
-    """if espansa == 0:
-
-        data=" OR spec LIKE ".join(entries)
-        doubleddata=" OR spec LIKE ".join(inizialeraddoppiata(entries))
         
-        if raddoppiata == 1:
-            theSimpleQuery = "SELECT spec AS lemma, cat AS cat_gr, omo AS disambiguatore, nocc AS occ, cod FROM lem WHERE spec LIKE " + data  + " OR spec LIKE " + doubleddata + "ORDER BY idlem"
-        else:
-            theSimpleQuery = "SELECT spec AS lemma, cat AS cat_gr, omo AS disambiguatore, nocc AS occ, cod FROM lem WHERE spec LIKE " + data + " ORDER BY idlem"
-
-        con = sqlite3.connect("file:" + path + "/db/test1.db" + "?mode=ro", uri=True)
-        answer_table = pd.read_sql(theSimpleQuery, con)
-        return answer_table
-
-    else:
-
-        data=" OR spec LIKE ".join(entries)
-        data_norm=" OR norm LIKE ".join(list_normalize(entries))
-        doubleddata_norm=" OR norm LIKE ".join(list_normalize(inizialeraddoppiata(entries)))
-        doubleddata=" OR spec LIKE ".join(inizialeraddoppiata(entries))
-
-        if raddoppiata == 1:
-            theSimpleQuery = "SELECT DISTINCT spec AS lemma, cat AS cat_gr, omo AS disambiguatore, nocc AS occ, cod FROM lem WHERE (spec LIKE " + data +") OR (norm LIKE " + data_norm + ") OR (spec LIKE " + doubleddata + ") OR (norm LIKE " + doubleddata_norm + ")" + " ORDER BY idlem"
-        else:
-            theSimpleQuery = "SELECT DISTINCT spec AS lemma, cat AS cat_gr, omo AS disambiguatore, nocc AS occ, cod FROM lem WHERE (spec LIKE " + data +") OR (norm LIKE " + data_norm + ")" + " ORDER BY idlem"
-
-        con = sqlite3.connect("file:" + path + "/db/test1.db" + "?mode=ro", uri=True)
-        answer_table = pd.read_sql(theSimpleQuery, con)
-        return answer_table"""
 #%% Funzione ricerca di forme con vista lemmi
 def ricercaformelemmi (entries, path, espansa, raddoppiata):
     if espansa == 0:
@@ -328,9 +313,14 @@ def ricercacatgr (entry, path):
 
 #%% 
 path = "/Users/leonardocanova/Library/CloudStorage/OneDrive-ConsiglioNazionaledelleRicerche/TIGRO/Ricerche/db/first_db"
-entry = "filius"
+entry = "come"
+# Condition dictionaries for definiscisottocorpus; presumably each key is a
+# column of the queried table and each value the accepted value(s) - confirm
+# against definiscisottocorpus.
+conditions = {"IQ": "TS", "[Titolo Abbreviato]": ["Simintendi, a. 1333 (prat.)"]}
+conditions2 = {"[Autore]": ["Dante Alighieri", "Boccaccio, Giovanni"]}
+chronoconditions = {"[Anno iniziale]": 1320, "[Anno finale]": 1375}
+# Only `conditions` is used in the lines shown here; conditions2 and
+# chronoconditions are defined but not passed on.
+subcorpus = definiscisottocorpus(path,"datibib", conditions)
+print(subcorpus)
 #df=ricercacatgr(entry, path)
-df=ricercaforme(interpreter(entry), path, 0, 0)
+# Lemma search restricted to the subcorpus computed above.
+df=ricercalemmi(interpreter(entry), path, 0, 0, subcorpus)
 dtale.show(df)
 # %%
  

+ 2 - 2
test_suite/test/test_sottocorpora.py

@@ -3,8 +3,8 @@ import sqlite3
 import pandas as pd
 import dtale
 import unicodedata
-from simple_query_test_pandas import ricercaforme, ricercalemmi, ricercaformelemmi, ricercalemmiforme, interpreter
-from test_occorrenzario_pandas import findtexts, findcontexts, findbib, contestimultipli, get_tables_occ
+#from simple_query_test_pandas import ricercaforme, ricercalemmi, ricercaformelemmi, ricercalemmiforme, interpreter
+#from test_occorrenzario_pandas import findtexts, findcontexts, findbib, contestimultipli, get_tables_occ
 
 #funzione di definizione sottocorpus
 def definiscisottocorpus(path, table, conditions=None):