# Questo NON è parte del codice: è un notebook Jupyter (nell'implementazione di VSCode) # che ho usato per fare dei test! # %% # Test code using Jupyter # %% import sqlite3 import re import pandas as pd import dtale import unicodedata import sys #from test_occorrenzario_pandas import findtexts, get_tables_occ from test_sottocorpora import definiscisottocorpus def get_tables_occ(path): conn = sqlite3.connect(f"file:{path}/test1.db?mode=ro", uri=True) cursor = conn.cursor() cursor.execute("SELECT name FROM sqlite_master WHERE type='table';") table_names = cursor.fetchall() occ_tables = [table[0] for table in table_names if table[0].startswith('Occ')] cursor.close() conn.close() return occ_tables #%% ha in input le funzioni di ricerca, cerca nell'occorrenziario i puntatori ai contesti e altri elementi ad essi associati. #l'attributo type definisce il tipo di ricerca in input (0 per forme, 1 per lemmi o categoria grammaticale, 2 per lemmi con opzione "mostra occorrenze non lemmatizzate") def findtexts(type, df, listOcc, path, subcorpus=None): textlist = pd.DataFrame() codlist = list(df["cod"]) con = sqlite3.connect(f"file:{path}/test1.db?mode=ro", uri=True) for table in listOcc: strlist = ",".join(str(c) for c in codlist) if type == 0: Query = f"SELECT tab.cod, tab.indlem, tab.ntx, tab.pitxt, tab.elemlen, tab.mappa, tab.numperiod, tab.links, tab.numorg, intbib.sigla, tab.vol, tab.pag, tab.riga, tab.col, tab.tipostanza, tab.stanza, tab.verso, tab.numbrano, lem.spec AS lemma, lem.cat AS cat_gr, lem.omo AS disambiguatore FROM {table} AS tab INNER JOIN intbib ON tab.ntx = intbib.ntx INNER JOIN lem ON tab.indlem = lem.cod WHERE tab.cod IN ({strlist})" extendequeryReponse = pd.read_sql(Query, con) textlist = pd.concat([textlist, extendequeryReponse]) elif type == 1: Query = f"SELECT tab.cod, tab.indlem, tab.ntx, tab.pitxt, tab.elemlen, tab.mappa, tab.numperiod, tab.links, tab.numorg, intbib.sigla, tab.vol, tab.pag, tab.riga, tab.col, tab.tipostanza, tab.stanza, tab.verso, tab.numbrano, lem.spec AS lemma, lem.cat AS cat_gr, lem.omo AS disambiguatore FROM {table} AS tab INNER JOIN intbib ON tab.ntx = intbib.ntx INNER JOIN lem ON tab.indlem = lem.cod WHERE tab.indlem IN ({strlist})" extendequeryReponse = pd.read_sql(Query, con) textlist = pd.concat([textlist, extendequeryReponse]) elif type == 2: subquery = f"SELECT DISTINCT lemma, forma FROM pfl WHERE lemma IN ({strlist})" subdf = pd.read_sql(subquery, con) formcodlist = list(subdf["forma"]) strform = ",".join(str(c) for c in formcodlist) Query = f"SELECT tab.cod, tab.indlem, tab.ntx, tab.pitxt, tab.elemlen, tab.mappa, tab.numperiod, tab.links, tab.numorg, intbib.sigla, tab.vol, tab.pag, tab.riga, tab.col, tab.tipostanza, tab.stanza, tab.verso, tab.numbrano, lem.spec AS lemma, lem.cat AS cat_gr, lem.omo AS disambiguatore FROM {table} AS tab INNER JOIN intbib ON tab.ntx = intbib.ntx INNER JOIN lem ON tab.indlem = lem.cod WHERE tab.indlem IN ({strlist}) OR (tab.indlem = 0 AND tab.cod IN ({strform}))" extendequeryReponse = pd.read_sql(Query, con) textlist = pd.concat([textlist, extendequeryReponse]) if subcorpus is not None: filter = textlist['sigla'].isin(subcorpus) textlist = textlist[filter] return textlist #%% funzione combinazioni <> è chiamata da interpreter def combinations(s): result = [] start = s.find("<") end = s.find(">") if start == -1 or end == -1: return [s] items = s[start + 1:end].split(",") for item in items: result.extend([s[:start] + item + rest for rest in combinations(s[end + 1:])]) return result #%% funzione interprete, sta alla base di ogni ricerca def interpreter (data): clean_data= "'"+data.replace("*", "%").replace("?", "_").replace(" ","").replace("'", "''").replace("’", "''") +"'" return combinations(clean_data) # %% funzione iniziale raddoppiata, è chiamata dalle funzioni di ricerca con iniziale raddoppiata def inizialeraddoppiata (data): doubleddata=[] for el in data: if el[1] != "%" and "_": doubleddata = doubleddata + ["'"+ el[1] + el[1:]] return doubleddata # %% funzione normalizza stringa (ricerca espansa), è chiamata dalle funzioni di ricerca espansa def normalize(stringa): return unicodedata.normalize('NFKD', stringa).encode('ASCII', 'ignore').decode('utf-8') def list_normalize(lista): return [normalize(stringa) for stringa in lista] # %% funzione counter, può essere chiamata sui risultati delle ricerche per visualizzare le forme/lemmi e il numero di occorrenze individuate def counter (results): if not results.empty: trovati= len(results.index) occorrenze= results['occ'].sum() return ("Trovati=" + str(trovati) + " Occorrenze=" + str(occorrenze)) #%% Funzione ricerca per forme def ricercaforme (entries, path, espansa, raddoppiata, subcorpus=None): if espansa == 0: data=" OR spec LIKE ".join(entries) doubleddata=" OR spec LIKE ".join(inizialeraddoppiata(entries)) if raddoppiata == 1: theSimpleQuery = f"SELECT spec AS forma, nocc AS occ, cod FROM form WHERE spec LIKE {data} OR spec LIKE {doubleddata} ORDER BY idfor" else: theSimpleQuery = f"SELECT spec AS forma, nocc AS occ, cod FROM form WHERE spec LIKE {data} ORDER BY idfor" else: data=" OR spec LIKE ".join(entries) data_norm=" OR norm LIKE ".join(list_normalize(entries)) doubleddata_norm=" OR norm LIKE ".join(list_normalize(inizialeraddoppiata(entries))) doubleddata=" OR spec LIKE ".join(inizialeraddoppiata(entries)) if raddoppiata == 1: theSimpleQuery = f"SELECT DISTINCT spec AS forma, nocc AS occ, cod FROM form WHERE (spec LIKE {data}) OR (norm LIKE {data_norm}) OR (spec LIKE {doubleddata}) OR (norm LIKE {doubleddata_norm}) ORDER BY idfor" else: theSimpleQuery = f"SELECT DISTINCT spec AS forma, nocc AS occ, cod FROM form WHERE (spec LIKE {data}) OR (norm LIKE {data_norm}) ORDER BY idfor" con = sqlite3.connect(f"file:{path}/test1.db?mode=ro", uri=True) answer_table = pd.read_sql(theSimpleQuery, con) if answer_table.empty: print ("Nessun risultato") sys.exit(1) else: if subcorpus == None: return answer_table else: listOcc = get_tables_occ(path) textlist = findtexts(0, answer_table, listOcc, path, subcorpus) counts = textlist.groupby('cod').size().reset_index(name='count') answer_table = pd.merge(answer_table, counts, on='cod', how='left') answer_table['occ'] = answer_table['count'].fillna(answer_table['occ']).astype(int) answer_table = answer_table.dropna(subset=['count']) return answer_table #%% Funzione ricerca per lemmi def ricercalemmi (entries, path, espansa, raddoppiata, subcorpus=None): if espansa == 0: data = " OR spec LIKE ".join(entries) doubleddata = " OR spec LIKE ".join(inizialeraddoppiata(entries)) if raddoppiata == 1: theSimpleQuery = f"SELECT spec AS lemma, cat AS cat_gr, omo AS disambiguatore, nocc AS occ, cod FROM lem WHERE spec LIKE {data} OR spec LIKE {doubleddata} ORDER BY idlem" else: theSimpleQuery = f"SELECT spec AS lemma, cat AS cat_gr, omo AS disambiguatore, nocc AS occ, cod FROM lem WHERE spec LIKE {data} ORDER BY idlem" else: data = " OR spec LIKE ".join(entries) data_norm = " OR norm LIKE ".join(list_normalize(entries)) doubleddata_norm = " OR norm LIKE ".join(list_normalize(inizialeraddoppiata(entries))) doubleddata = " OR spec LIKE ".join(inizialeraddoppiata(entries)) if raddoppiata == 1: theSimpleQuery = f"SELECT DISTINCT spec AS lemma, cat AS cat_gr, omo AS disambiguatore, nocc AS occ, cod FROM lem WHERE (spec LIKE {data}) OR (norm LIKE {data_norm}) OR (spec LIKE {doubleddata}) OR (norm LIKE {doubleddata_norm}) ORDER BY idlem" else: theSimpleQuery = f"SELECT DISTINCT spec AS lemma, cat AS cat_gr, omo AS disambiguatore, nocc AS occ, cod FROM lem WHERE (spec LIKE {data}) OR (norm LIKE {data_norm}) ORDER BY idlem" con = sqlite3.connect(f"file:{path}/test1.db?mode=ro", uri=True) answer_table = pd.read_sql(theSimpleQuery, con) if answer_table.empty: print ("Nessun risultato") sys.exit(1) else: if subcorpus == None: return answer_table else: listOcc = get_tables_occ(path) textlist = findtexts(1, answer_table, listOcc, path, subcorpus) counts = textlist.groupby('indlem').size().reset_index(name='count') answer_table = pd.merge(answer_table, counts, left_on='cod', right_on='indlem') answer_table['occ'] = answer_table['count'].fillna(answer_table['occ']).astype(int) answer_table = answer_table.dropna(subset=['count']) return answer_table #%% Funzione ricerca di forme con vista lemmi def ricercaformelemmi (entries, path, espansa, raddoppiata): if espansa == 0: data = " OR form.spec LIKE ".join(entries) doubleddata = " OR form.spec LIKE ".join(inizialeraddoppiata(entries)) if raddoppiata == 1: theSimpleQuery = f"SELECT form.spec AS forma, lem.spec AS lemma, lem.cat AS cat_gr, lem.omo AS disambiguatore, pfl.nocc AS occ, form.cod FROM pfl INNER JOIN form ON form.cod = pfl.forma INNER JOIN lem ON lem.cod = pfl.lemma WHERE form.spec LIKE {data} OR form.spec LIKE {doubleddata} ORDER BY form.idfor" else: theSimpleQuery = f"SELECT form.spec AS forma, lem.spec AS lemma, lem.cat AS cat_gr, lem.omo AS disambiguatore, pfl.nocc AS occ, form.cod FROM pfl INNER JOIN form ON form.cod = pfl.forma INNER JOIN lem ON lem.cod = pfl.lemma WHERE form.spec LIKE {data} ORDER BY form.idfor" con = sqlite3.connect(f"file:{path}/test1.db?mode=ro", uri=True) answer_table = pd.read_sql(theSimpleQuery, con) if answer_table.empty: print ("Nessun risultato") sys.exit(1) else: return answer_table else: data = " OR form.spec LIKE ".join(entries) data_norm = " OR form.norm LIKE ".join(list_normalize(entries)) doubleddata_norm = " OR form.norm LIKE ".join(list_normalize(inizialeraddoppiata(entries))) doubleddata = " OR form.spec LIKE ".join(inizialeraddoppiata(entries)) if raddoppiata == 1: theSimpleQuery = f"SELECT DISTINCT form.spec AS forma, lem.spec AS lemma, lem.cat AS cat_gr, lem.omo AS disambiguatore, pfl.nocc AS occ, form.cod FROM pfl INNER JOIN form ON form.cod = pfl.forma INNER JOIN lem ON lem.cod = pfl.lemma WHERE (form.spec LIKE {data}) OR (form.norm LIKE {data_norm}) OR (form.spec LIKE {doubleddata}) OR (form.norm LIKE {doubleddata_norm}) ORDER BY form.idfor" else: theSimpleQuery = f"SELECT DISTINCT form.spec AS forma, lem.spec AS lemma, lem.cat AS cat_gr, lem.omo AS disambiguatore, pfl.nocc AS occ, form.cod FROM pfl INNER JOIN form ON form.cod = pfl.forma INNER JOIN lem ON lem.cod = pfl.lemma WHERE (form.spec LIKE {data}) OR (form.norm LIKE {data_norm}) ORDER BY form.idfor" con = sqlite3.connect(f"file:{path}/test1.db?mode=ro", uri=True) answer_table = pd.read_sql(theSimpleQuery, con) if answer_table.empty: print ("Nessun risultato") sys.exit(1) else: return answer_table #deprecated """if espansa == 0: data=" OR form.spec LIKE ".join(entries) doubleddata=" OR form.spec LIKE ".join(inizialeraddoppiata(entries)) if raddoppiata == 1: theSimpleQuery = "SELECT form.spec AS forma, lem.spec AS lemma, lem.cat AS cat_gr, lem.omo AS disambiguatore, pfl.nocc AS occ, form.cod FROM pfl INNER JOIN form ON form.cod = pfl.forma INNER JOIN lem ON lem.cod = pfl.lemma WHERE form.spec LIKE " + data + " OR form.spec LIKE " + doubleddata + " ORDER BY form.idfor" else: theSimpleQuery = "SELECT form.spec AS forma, lem.spec AS lemma, lem.cat AS cat_gr, lem.omo AS disambiguatore, pfl.nocc AS occ, form.cod FROM pfl INNER JOIN form ON form.cod = pfl.forma INNER JOIN lem ON lem.cod = pfl.lemma WHERE form.spec LIKE " + data + " ORDER BY form.idfor" con = sqlite3.connect("file:" + path + "/db/test1.db" + "?mode=ro", uri=True) answer_table = pd.read_sql(theSimpleQuery, con) return answer_table else: data=" OR form.spec LIKE ".join(entries) data_norm=" OR form.norm LIKE ".join(list_normalize(entries)) doubleddata_norm=" OR form.norm LIKE ".join(list_normalize(inizialeraddoppiata(entries))) doubleddata=" OR form.spec LIKE ".join(inizialeraddoppiata(entries)) if raddoppiata == 1: theSimpleQuery = "SELECT DISTINCT form.spec AS forma, lem.spec AS lemma, lem.cat AS cat_gr, lem.omo AS disambiguatore, pfl.nocc AS occ, form.cod FROM pfl INNER JOIN form ON form.cod = pfl.forma INNER JOIN lem ON lem.cod = pfl.lemma WHERE (form.spec LIKE " + data +") OR (form.norm LIKE " + data_norm + ") OR (form.spec LIKE " + doubleddata + ") OR (form.norm LIKE " + doubleddata_norm + ")" + " ORDER BY form.idfor" else: theSimpleQuery = "SELECT DISTINCT form.spec AS forma, lem.spec AS lemma, lem.cat AS cat_gr, lem.omo AS disambiguatore, pfl.nocc AS occ, form.cod FROM pfl INNER JOIN form ON form.cod = pfl.forma INNER JOIN lem ON lem.cod = pfl.lemma WHERE (form.spec LIKE " + data +") OR (form.norm LIKE " + data_norm + ")" + " ORDER BY form.idfor" con = sqlite3.connect("file:" + path + "/db/test1.db" + "?mode=ro", uri=True) answer_table = pd.read_sql(theSimpleQuery, con) return answer_table""" #%% Funzione ricerca lemmi con vista forme def ricercalemmiforme (entries, path, espansa, raddoppiata): if espansa == 0: data = " OR form.spec LIKE ".join(entries) doubleddata = " OR form.spec LIKE ".join(inizialeraddoppiata(entries)) if raddoppiata == 1: theSimpleQuery = f"SELECT lem.spec AS lemma, lem.cat AS cat_gr, form.spec AS forma, lem.omo AS disambiguatore, pfl.nocc AS occ, lem.cod FROM pfl INNER JOIN form ON form.cod = pfl.forma INNER JOIN lem ON lem.cod != 0 AND lem.cod = pfl.lemma WHERE lem.spec LIKE {data} OR form.spec LIKE {doubleddata} ORDER BY lem.idlem" else: theSimpleQuery = f"SELECT lem.spec AS lemma, lem.cat AS cat_gr, form.spec AS forma, lem.omo AS disambiguatore, pfl.nocc AS occ, lem.cod FROM pfl INNER JOIN form ON form.cod = pfl.forma INNER JOIN lem ON lem.cod != 0 AND lem.cod = pfl.lemma WHERE lem.spec LIKE {data} ORDER BY lem.idlem" con = sqlite3.connect(f"file:{path}/test1.db?mode=ro", uri=True) answer_table = pd.read_sql(theSimpleQuery, con) if answer_table.empty: print ("Nessun risultato") sys.exit(1) else: return answer_table else: data = " OR lem.spec LIKE ".join(entries) data_norm = " OR lem.norm LIKE ".join(list_normalize(entries)) doubleddata_norm = " OR lem.norm LIKE ".join(list_normalize(inizialeraddoppiata(entries))) doubleddata = " OR lem.spec LIKE ".join(inizialeraddoppiata(entries)) if raddoppiata == 1: theSimpleQuery = f"SELECT DISTINCT lem.spec AS lemma, lem.cat AS cat_gr, form.spec AS forma, lem.omo AS disambiguatore, pfl.nocc AS occ, lem.cod FROM pfl INNER JOIN form ON form.cod = pfl.forma INNER JOIN lem ON lem.cod = pfl.lemma WHERE (lem.spec LIKE {data}) OR (lem.norm LIKE {data_norm}) OR (lem.spec LIKE {doubleddata}) OR (lem.norm LIKE {doubleddata_norm}) ORDER BY lem.idlem" else: theSimpleQuery = f"SELECT DISTINCT lem.spec AS lemma, lem.cat AS cat_gr, form.spec AS forma, lem.omo AS disambiguatore, pfl.nocc AS occ, lem.cod FROM pfl INNER JOIN form ON form.cod = pfl.forma INNER JOIN lem ON lem.cod = pfl.lemma WHERE (lem.spec LIKE {data}) OR (lem.norm LIKE {data_norm}) ORDER BY lem.idlem" con = sqlite3.connect(f"file:{path}/test1.db?mode=ro", uri=True) answer_table = pd.read_sql(theSimpleQuery, con) if answer_table.empty: print ("Nessun risultato") sys.exit(1) else: return answer_table #deprecated """if espansa == 0: data=" OR form.spec LIKE ".join(entries) doubleddata=" OR form.spec LIKE ".join(inizialeraddoppiata(entries)) if raddoppiata == 1: theSimpleQuery = "SELECT lem.spec AS lemma, lem.cat AS cat_gr, form.spec AS forma, lem.omo AS disambiguatore, pfl.nocc AS occ, lem.cod FROM pfl INNER JOIN form ON form.cod = pfl.forma INNER JOIN lem ON lem.cod != 0 AND lem.cod = pfl.lemma WHERE lem.spec LIKE " + data + " OR form.spec LIKE " + doubleddata + " ORDER BY lem.idlem" else: theSimpleQuery = "SELECT lem.spec AS lemma, lem.cat AS cat_gr, form.spec AS forma,lem.omo AS disambiguatore, pfl.nocc AS occ, lem.cod FROM pfl INNER JOIN form ON form.cod = pfl.forma INNER JOIN lem ON lem.cod != 0 AND lem.cod = pfl.lemma WHERE lem.spec LIKE " + data + " ORDER BY lem.idlem" con = sqlite3.connect("file:" + path + "/db/test1.db" + "?mode=ro", uri=True) answer_table = pd.read_sql(theSimpleQuery, con) return answer_table else: data=" OR lem.spec LIKE ".join(entries) data_norm=" OR lem.norm LIKE ".join(list_normalize(entries)) doubleddata_norm=" OR lem.norm LIKE ".join(list_normalize(inizialeraddoppiata(entries))) doubleddata=" OR lem.spec LIKE ".join(inizialeraddoppiata(entries)) if raddoppiata == 1: theSimpleQuery = "SELECT DISTINCT lem.spec AS lemma, lem.cat AS cat_gr, form.spec AS forma, lem.omo AS disambiguatore, pfl.nocc AS occ, lem.cod FROM pfl INNER JOIN form ON form.cod = pfl.forma INNER JOIN lem ON lem.cod = pfl.lemma WHERE (lem.spec LIKE " + data +") OR (lem.norm LIKE " + data_norm + ") OR (lem.spec LIKE " + doubleddata + ") OR (lem.norm LIKE " + doubleddata_norm + ")" + " ORDER BY lem.idlem" else: theSimpleQuery = "SELECT DISTINCT lem.spec AS lemma, lem.cat AS cat_gr, form.spec AS forma, lem.omo AS disambiguatore, pfl.nocc AS occ, lem.cod FROM pfl INNER JOIN form ON form.cod = pfl.forma INNER JOIN lem ON lem.cod = pfl.lemma WHERE (lem.spec LIKE " + data +") OR (lem.norm LIKE " + data_norm + ")" + " ORDER BY lem.idlem" con = sqlite3.connect("file:" + path + "/db/test1.db" + "?mode=ro", uri=True) answer_table = pd.read_sql(theSimpleQuery, con) return answer_table""" # %% Ricerca per categorie grammaticali def ricercacatgr (entry, path): theSimpleQuery = f"SELECT spec AS lemma, cat AS cat_gr, omo AS disambiguatore, nocc AS occ, cod FROM lem WHERE cat = '{entry}' ORDER BY idlem" con = sqlite3.connect(f"file:{path}/test1.db?mode=ro", uri=True) answer_table = pd.read_sql(theSimpleQuery, con) if answer_table.empty: print ("Nessun risultato") sys.exit(1) else: return answer_table #%% path = "/Users/leonardocanova/Library/CloudStorage/OneDrive-ConsiglioNazionaledelleRicerche/TIGRO/Ricerche/db/first_db" entry = "come" conditions = {"IQ": "TS", "[Titolo Abbreviato]": ["Simintendi, a. 1333 (prat.)"]} conditions2 = {"[Autore]": ["Dante Alighieri", "Boccaccio, Giovanni"]} chronoconditions = {"[Anno iniziale]": 1320, "[Anno finale]": 1375} subcorpus = definiscisottocorpus(path,"datibib", conditions) print(subcorpus) #df=ricercacatgr(entry, path) df=ricercalemmi(interpreter(entry), path, 0, 0, subcorpus) dtale.show(df) # %%