123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326 |
- # Questo NON è parte del codice: è un notebook Jupyter (nell'implementazione di VSCode)
- # che ho usato per fare dei test!
- # %%
- # Test code using Jupyter
- # %%
- import sqlite3
- import re
- import pandas as pd
- import dtale
- import unicodedata
- import sys
- #from test_occorrenzario_pandas import findtexts, get_tables_occ
- from test_sottocorpora import definiscisottocorpus
- def get_tables_occ(path):
- conn = sqlite3.connect(f"file:{path}/test1.db?mode=ro", uri=True)
- cursor = conn.cursor()
- cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
- table_names = cursor.fetchall()
- occ_tables = [table[0] for table in table_names if table[0].startswith('Occ')]
- cursor.close()
- conn.close()
- return occ_tables
- #%% ha in input le funzioni di ricerca, cerca nell'occorrenziario i puntatori ai contesti e altri elementi ad essi associati.
- #l'attributo type definisce il tipo di ricerca in input (0 per forme, 1 per lemmi o categoria grammaticale, 2 per lemmi con opzione "mostra occorrenze non lemmatizzate")
- def findtexts(type, df, listOcc, path, subcorpus=None):
- textlist = pd.DataFrame()
- codlist = list(df["cod"])
- con = sqlite3.connect(f"file:{path}/test1.db?mode=ro", uri=True)
- for table in listOcc:
- strlist = ",".join(str(c) for c in codlist)
- if type == 0:
- Query = f"SELECT tab.cod, tab.indlem, tab.ntx, tab.pitxt, tab.elemlen, tab.mappa, tab.numperiod, tab.links, tab.numorg, intbib.sigla, tab.vol, tab.pag, tab.riga, tab.col, tab.tipostanza, tab.stanza, tab.verso, tab.numbrano, lem.spec AS lemma, lem.cat AS cat_gr, lem.omo AS disambiguatore FROM {table} AS tab INNER JOIN intbib ON tab.ntx = intbib.ntx INNER JOIN lem ON tab.indlem = lem.cod WHERE tab.cod IN ({strlist})"
- extendequeryReponse = pd.read_sql(Query, con)
- textlist = pd.concat([textlist, extendequeryReponse])
- elif type == 1:
- Query = f"SELECT tab.cod, tab.indlem, tab.ntx, tab.pitxt, tab.elemlen, tab.mappa, tab.numperiod, tab.links, tab.numorg, intbib.sigla, tab.vol, tab.pag, tab.riga, tab.col, tab.tipostanza, tab.stanza, tab.verso, tab.numbrano, lem.spec AS lemma, lem.cat AS cat_gr, lem.omo AS disambiguatore FROM {table} AS tab INNER JOIN intbib ON tab.ntx = intbib.ntx INNER JOIN lem ON tab.indlem = lem.cod WHERE tab.indlem IN ({strlist})"
- extendequeryReponse = pd.read_sql(Query, con)
- textlist = pd.concat([textlist, extendequeryReponse])
- elif type == 2:
- subquery = f"SELECT DISTINCT lemma, forma FROM pfl WHERE lemma IN ({strlist})"
- subdf = pd.read_sql(subquery, con)
- formcodlist = list(subdf["forma"])
- strform = ",".join(str(c) for c in formcodlist)
- Query = f"SELECT tab.cod, tab.indlem, tab.ntx, tab.pitxt, tab.elemlen, tab.mappa, tab.numperiod, tab.links, tab.numorg, intbib.sigla, tab.vol, tab.pag, tab.riga, tab.col, tab.tipostanza, tab.stanza, tab.verso, tab.numbrano, lem.spec AS lemma, lem.cat AS cat_gr, lem.omo AS disambiguatore FROM {table} AS tab INNER JOIN intbib ON tab.ntx = intbib.ntx INNER JOIN lem ON tab.indlem = lem.cod WHERE tab.indlem IN ({strlist}) OR (tab.indlem = 0 AND tab.cod IN ({strform}))"
- extendequeryReponse = pd.read_sql(Query, con)
- textlist = pd.concat([textlist, extendequeryReponse])
- if subcorpus is not None:
- filter = textlist['sigla'].isin(subcorpus)
- textlist = textlist[filter]
- return textlist
- #%% funzione combinazioni <> è chiamata da interpreter
- def combinations(s):
- result = []
- start = s.find("<")
- end = s.find(">")
- if start == -1 or end == -1:
- return [s]
- items = s[start + 1:end].split(",")
- for item in items:
- result.extend([s[:start] + item + rest for rest in combinations(s[end + 1:])])
- return result
- #%% funzione interprete, sta alla base di ogni ricerca
- def interpreter (data):
- clean_data= "'"+data.replace("*", "%").replace("?", "_").replace(" ","").replace("'", "''").replace("’", "''") +"'"
- return combinations(clean_data)
-
- # %% funzione iniziale raddoppiata, è chiamata dalle funzioni di ricerca con iniziale raddoppiata
- def inizialeraddoppiata (data):
- doubleddata=[]
- for el in data:
- if el[1] != "%" and "_":
- doubleddata = doubleddata + ["'"+ el[1] + el[1:]]
- return doubleddata
- # %% funzione normalizza stringa (ricerca espansa), è chiamata dalle funzioni di ricerca espansa
- def normalize(stringa):
- return unicodedata.normalize('NFKD', stringa).encode('ASCII', 'ignore').decode('utf-8')
- def list_normalize(lista):
- return [normalize(stringa) for stringa in lista]
- # %% funzione counter, può essere chiamata sui risultati delle ricerche per visualizzare le forme/lemmi e il numero di occorrenze individuate
- def counter (results):
- if not results.empty:
- trovati= len(results.index)
- occorrenze= results['occ'].sum()
- return ("Trovati=" + str(trovati) + " Occorrenze=" + str(occorrenze))
- #%% Funzione ricerca per forme
- def ricercaforme (entries, path, espansa, raddoppiata, subcorpus=None):
- if espansa == 0:
- data=" OR spec LIKE ".join(entries)
- doubleddata=" OR spec LIKE ".join(inizialeraddoppiata(entries))
-
- if raddoppiata == 1:
- theSimpleQuery = f"SELECT spec AS forma, nocc AS occ, cod FROM form WHERE spec LIKE {data} OR spec LIKE {doubleddata} ORDER BY idfor"
- else:
- theSimpleQuery = f"SELECT spec AS forma, nocc AS occ, cod FROM form WHERE spec LIKE {data} ORDER BY idfor"
- else:
- data=" OR spec LIKE ".join(entries)
- data_norm=" OR norm LIKE ".join(list_normalize(entries))
- doubleddata_norm=" OR norm LIKE ".join(list_normalize(inizialeraddoppiata(entries)))
- doubleddata=" OR spec LIKE ".join(inizialeraddoppiata(entries))
- if raddoppiata == 1:
- theSimpleQuery = f"SELECT DISTINCT spec AS forma, nocc AS occ, cod FROM form WHERE (spec LIKE {data}) OR (norm LIKE {data_norm}) OR (spec LIKE {doubleddata}) OR (norm LIKE {doubleddata_norm}) ORDER BY idfor"
- else:
- theSimpleQuery = f"SELECT DISTINCT spec AS forma, nocc AS occ, cod FROM form WHERE (spec LIKE {data}) OR (norm LIKE {data_norm}) ORDER BY idfor"
- con = sqlite3.connect(f"file:{path}/test1.db?mode=ro", uri=True)
- answer_table = pd.read_sql(theSimpleQuery, con)
- if answer_table.empty:
- print ("Nessun risultato")
- sys.exit(1)
- else:
- if subcorpus == None:
- return answer_table
- else:
- listOcc = get_tables_occ(path)
- textlist = findtexts(0, answer_table, listOcc, path, subcorpus)
- counts = textlist.groupby('cod').size().reset_index(name='count')
- answer_table = pd.merge(answer_table, counts, on='cod', how='left')
- answer_table['occ'] = answer_table['count'].fillna(answer_table['occ']).astype(int)
- answer_table = answer_table.dropna(subset=['count'])
- return answer_table
-
- #%% Funzione ricerca per lemmi
- def ricercalemmi (entries, path, espansa, raddoppiata, subcorpus=None):
- if espansa == 0:
- data = " OR spec LIKE ".join(entries)
- doubleddata = " OR spec LIKE ".join(inizialeraddoppiata(entries))
- if raddoppiata == 1:
- theSimpleQuery = f"SELECT spec AS lemma, cat AS cat_gr, omo AS disambiguatore, nocc AS occ, cod FROM lem WHERE spec LIKE {data} OR spec LIKE {doubleddata} ORDER BY idlem"
- else:
- theSimpleQuery = f"SELECT spec AS lemma, cat AS cat_gr, omo AS disambiguatore, nocc AS occ, cod FROM lem WHERE spec LIKE {data} ORDER BY idlem"
- else:
- data = " OR spec LIKE ".join(entries)
- data_norm = " OR norm LIKE ".join(list_normalize(entries))
- doubleddata_norm = " OR norm LIKE ".join(list_normalize(inizialeraddoppiata(entries)))
- doubleddata = " OR spec LIKE ".join(inizialeraddoppiata(entries))
- if raddoppiata == 1:
- theSimpleQuery = f"SELECT DISTINCT spec AS lemma, cat AS cat_gr, omo AS disambiguatore, nocc AS occ, cod FROM lem WHERE (spec LIKE {data}) OR (norm LIKE {data_norm}) OR (spec LIKE {doubleddata}) OR (norm LIKE {doubleddata_norm}) ORDER BY idlem"
- else:
- theSimpleQuery = f"SELECT DISTINCT spec AS lemma, cat AS cat_gr, omo AS disambiguatore, nocc AS occ, cod FROM lem WHERE (spec LIKE {data}) OR (norm LIKE {data_norm}) ORDER BY idlem"
- con = sqlite3.connect(f"file:{path}/test1.db?mode=ro", uri=True)
- answer_table = pd.read_sql(theSimpleQuery, con)
- if answer_table.empty:
- print ("Nessun risultato")
- sys.exit(1)
- else:
- if subcorpus == None:
- return answer_table
- else:
- listOcc = get_tables_occ(path)
- textlist = findtexts(1, answer_table, listOcc, path, subcorpus)
- counts = textlist.groupby('indlem').size().reset_index(name='count')
- answer_table = pd.merge(answer_table, counts, left_on='cod', right_on='indlem')
- answer_table['occ'] = answer_table['count'].fillna(answer_table['occ']).astype(int)
- answer_table = answer_table.dropna(subset=['count'])
- return answer_table
-
- #%% Funzione ricerca di forme con vista lemmi
- def ricercaformelemmi (entries, path, espansa, raddoppiata):
- if espansa == 0:
- data = " OR form.spec LIKE ".join(entries)
- doubleddata = " OR form.spec LIKE ".join(inizialeraddoppiata(entries))
- if raddoppiata == 1:
- theSimpleQuery = f"SELECT form.spec AS forma, lem.spec AS lemma, lem.cat AS cat_gr, lem.omo AS disambiguatore, pfl.nocc AS occ, form.cod FROM pfl INNER JOIN form ON form.cod = pfl.forma INNER JOIN lem ON lem.cod = pfl.lemma WHERE form.spec LIKE {data} OR form.spec LIKE {doubleddata} ORDER BY form.idfor"
- else:
- theSimpleQuery = f"SELECT form.spec AS forma, lem.spec AS lemma, lem.cat AS cat_gr, lem.omo AS disambiguatore, pfl.nocc AS occ, form.cod FROM pfl INNER JOIN form ON form.cod = pfl.forma INNER JOIN lem ON lem.cod = pfl.lemma WHERE form.spec LIKE {data} ORDER BY form.idfor"
- con = sqlite3.connect(f"file:{path}/test1.db?mode=ro", uri=True)
- answer_table = pd.read_sql(theSimpleQuery, con)
- if answer_table.empty:
- print ("Nessun risultato")
- sys.exit(1)
- else:
- return answer_table
- else:
- data = " OR form.spec LIKE ".join(entries)
- data_norm = " OR form.norm LIKE ".join(list_normalize(entries))
- doubleddata_norm = " OR form.norm LIKE ".join(list_normalize(inizialeraddoppiata(entries)))
- doubleddata = " OR form.spec LIKE ".join(inizialeraddoppiata(entries))
- if raddoppiata == 1:
- theSimpleQuery = f"SELECT DISTINCT form.spec AS forma, lem.spec AS lemma, lem.cat AS cat_gr, lem.omo AS disambiguatore, pfl.nocc AS occ, form.cod FROM pfl INNER JOIN form ON form.cod = pfl.forma INNER JOIN lem ON lem.cod = pfl.lemma WHERE (form.spec LIKE {data}) OR (form.norm LIKE {data_norm}) OR (form.spec LIKE {doubleddata}) OR (form.norm LIKE {doubleddata_norm}) ORDER BY form.idfor"
- else:
- theSimpleQuery = f"SELECT DISTINCT form.spec AS forma, lem.spec AS lemma, lem.cat AS cat_gr, lem.omo AS disambiguatore, pfl.nocc AS occ, form.cod FROM pfl INNER JOIN form ON form.cod = pfl.forma INNER JOIN lem ON lem.cod = pfl.lemma WHERE (form.spec LIKE {data}) OR (form.norm LIKE {data_norm}) ORDER BY form.idfor"
- con = sqlite3.connect(f"file:{path}/test1.db?mode=ro", uri=True)
- answer_table = pd.read_sql(theSimpleQuery, con)
- if answer_table.empty:
- print ("Nessun risultato")
- sys.exit(1)
- else:
- return answer_table
- #deprecated
- """if espansa == 0:
- data=" OR form.spec LIKE ".join(entries)
- doubleddata=" OR form.spec LIKE ".join(inizialeraddoppiata(entries))
-
- if raddoppiata == 1:
- theSimpleQuery = "SELECT form.spec AS forma, lem.spec AS lemma, lem.cat AS cat_gr, lem.omo AS disambiguatore, pfl.nocc AS occ, form.cod FROM pfl INNER JOIN form ON form.cod = pfl.forma INNER JOIN lem ON lem.cod = pfl.lemma WHERE form.spec LIKE " + data + " OR form.spec LIKE " + doubleddata + " ORDER BY form.idfor"
- else:
- theSimpleQuery = "SELECT form.spec AS forma, lem.spec AS lemma, lem.cat AS cat_gr, lem.omo AS disambiguatore, pfl.nocc AS occ, form.cod FROM pfl INNER JOIN form ON form.cod = pfl.forma INNER JOIN lem ON lem.cod = pfl.lemma WHERE form.spec LIKE " + data + " ORDER BY form.idfor"
-
- con = sqlite3.connect("file:" + path + "/db/test1.db" + "?mode=ro", uri=True)
- answer_table = pd.read_sql(theSimpleQuery, con)
- return answer_table
- else:
-
- data=" OR form.spec LIKE ".join(entries)
- data_norm=" OR form.norm LIKE ".join(list_normalize(entries))
- doubleddata_norm=" OR form.norm LIKE ".join(list_normalize(inizialeraddoppiata(entries)))
- doubleddata=" OR form.spec LIKE ".join(inizialeraddoppiata(entries))
- if raddoppiata == 1:
- theSimpleQuery = "SELECT DISTINCT form.spec AS forma, lem.spec AS lemma, lem.cat AS cat_gr, lem.omo AS disambiguatore, pfl.nocc AS occ, form.cod FROM pfl INNER JOIN form ON form.cod = pfl.forma INNER JOIN lem ON lem.cod = pfl.lemma WHERE (form.spec LIKE " + data +") OR (form.norm LIKE " + data_norm + ") OR (form.spec LIKE " + doubleddata + ") OR (form.norm LIKE " + doubleddata_norm + ")" + " ORDER BY form.idfor"
- else:
- theSimpleQuery = "SELECT DISTINCT form.spec AS forma, lem.spec AS lemma, lem.cat AS cat_gr, lem.omo AS disambiguatore, pfl.nocc AS occ, form.cod FROM pfl INNER JOIN form ON form.cod = pfl.forma INNER JOIN lem ON lem.cod = pfl.lemma WHERE (form.spec LIKE " + data +") OR (form.norm LIKE " + data_norm + ")" + " ORDER BY form.idfor"
- con = sqlite3.connect("file:" + path + "/db/test1.db" + "?mode=ro", uri=True)
- answer_table = pd.read_sql(theSimpleQuery, con)
- return answer_table"""
- #%% Funzione ricerca lemmi con vista forme
- def ricercalemmiforme (entries, path, espansa, raddoppiata):
- if espansa == 0:
- data = " OR form.spec LIKE ".join(entries)
- doubleddata = " OR form.spec LIKE ".join(inizialeraddoppiata(entries))
- if raddoppiata == 1:
- theSimpleQuery = f"SELECT lem.spec AS lemma, lem.cat AS cat_gr, form.spec AS forma, lem.omo AS disambiguatore, pfl.nocc AS occ, lem.cod FROM pfl INNER JOIN form ON form.cod = pfl.forma INNER JOIN lem ON lem.cod != 0 AND lem.cod = pfl.lemma WHERE lem.spec LIKE {data} OR form.spec LIKE {doubleddata} ORDER BY lem.idlem"
- else:
- theSimpleQuery = f"SELECT lem.spec AS lemma, lem.cat AS cat_gr, form.spec AS forma, lem.omo AS disambiguatore, pfl.nocc AS occ, lem.cod FROM pfl INNER JOIN form ON form.cod = pfl.forma INNER JOIN lem ON lem.cod != 0 AND lem.cod = pfl.lemma WHERE lem.spec LIKE {data} ORDER BY lem.idlem"
- con = sqlite3.connect(f"file:{path}/test1.db?mode=ro", uri=True)
- answer_table = pd.read_sql(theSimpleQuery, con)
- if answer_table.empty:
- print ("Nessun risultato")
- sys.exit(1)
- else:
- return answer_table
- else:
- data = " OR lem.spec LIKE ".join(entries)
- data_norm = " OR lem.norm LIKE ".join(list_normalize(entries))
- doubleddata_norm = " OR lem.norm LIKE ".join(list_normalize(inizialeraddoppiata(entries)))
- doubleddata = " OR lem.spec LIKE ".join(inizialeraddoppiata(entries))
- if raddoppiata == 1:
- theSimpleQuery = f"SELECT DISTINCT lem.spec AS lemma, lem.cat AS cat_gr, form.spec AS forma, lem.omo AS disambiguatore, pfl.nocc AS occ, lem.cod FROM pfl INNER JOIN form ON form.cod = pfl.forma INNER JOIN lem ON lem.cod = pfl.lemma WHERE (lem.spec LIKE {data}) OR (lem.norm LIKE {data_norm}) OR (lem.spec LIKE {doubleddata}) OR (lem.norm LIKE {doubleddata_norm}) ORDER BY lem.idlem"
- else:
- theSimpleQuery = f"SELECT DISTINCT lem.spec AS lemma, lem.cat AS cat_gr, form.spec AS forma, lem.omo AS disambiguatore, pfl.nocc AS occ, lem.cod FROM pfl INNER JOIN form ON form.cod = pfl.forma INNER JOIN lem ON lem.cod = pfl.lemma WHERE (lem.spec LIKE {data}) OR (lem.norm LIKE {data_norm}) ORDER BY lem.idlem"
- con = sqlite3.connect(f"file:{path}/test1.db?mode=ro", uri=True)
- answer_table = pd.read_sql(theSimpleQuery, con)
- if answer_table.empty:
- print ("Nessun risultato")
- sys.exit(1)
- else:
- return answer_table
- #deprecated
- """if espansa == 0:
- data=" OR form.spec LIKE ".join(entries)
- doubleddata=" OR form.spec LIKE ".join(inizialeraddoppiata(entries))
-
- if raddoppiata == 1:
- theSimpleQuery = "SELECT lem.spec AS lemma, lem.cat AS cat_gr, form.spec AS forma, lem.omo AS disambiguatore, pfl.nocc AS occ, lem.cod FROM pfl INNER JOIN form ON form.cod = pfl.forma INNER JOIN lem ON lem.cod != 0 AND lem.cod = pfl.lemma WHERE lem.spec LIKE " + data + " OR form.spec LIKE " + doubleddata + " ORDER BY lem.idlem"
- else:
- theSimpleQuery = "SELECT lem.spec AS lemma, lem.cat AS cat_gr, form.spec AS forma,lem.omo AS disambiguatore, pfl.nocc AS occ, lem.cod FROM pfl INNER JOIN form ON form.cod = pfl.forma INNER JOIN lem ON lem.cod != 0 AND lem.cod = pfl.lemma WHERE lem.spec LIKE " + data + " ORDER BY lem.idlem"
- con = sqlite3.connect("file:" + path + "/db/test1.db" + "?mode=ro", uri=True)
- answer_table = pd.read_sql(theSimpleQuery, con)
- return answer_table
-
- else:
-
- data=" OR lem.spec LIKE ".join(entries)
- data_norm=" OR lem.norm LIKE ".join(list_normalize(entries))
- doubleddata_norm=" OR lem.norm LIKE ".join(list_normalize(inizialeraddoppiata(entries)))
- doubleddata=" OR lem.spec LIKE ".join(inizialeraddoppiata(entries))
- if raddoppiata == 1:
- theSimpleQuery = "SELECT DISTINCT lem.spec AS lemma, lem.cat AS cat_gr, form.spec AS forma, lem.omo AS disambiguatore, pfl.nocc AS occ, lem.cod FROM pfl INNER JOIN form ON form.cod = pfl.forma INNER JOIN lem ON lem.cod = pfl.lemma WHERE (lem.spec LIKE " + data +") OR (lem.norm LIKE " + data_norm + ") OR (lem.spec LIKE " + doubleddata + ") OR (lem.norm LIKE " + doubleddata_norm + ")" + " ORDER BY lem.idlem"
- else:
- theSimpleQuery = "SELECT DISTINCT lem.spec AS lemma, lem.cat AS cat_gr, form.spec AS forma, lem.omo AS disambiguatore, pfl.nocc AS occ, lem.cod FROM pfl INNER JOIN form ON form.cod = pfl.forma INNER JOIN lem ON lem.cod = pfl.lemma WHERE (lem.spec LIKE " + data +") OR (lem.norm LIKE " + data_norm + ")" + " ORDER BY lem.idlem"
- con = sqlite3.connect("file:" + path + "/db/test1.db" + "?mode=ro", uri=True)
- answer_table = pd.read_sql(theSimpleQuery, con)
- return answer_table"""
- # %% Ricerca per categorie grammaticali
- def ricercacatgr (entry, path):
- theSimpleQuery = f"SELECT spec AS lemma, cat AS cat_gr, omo AS disambiguatore, nocc AS occ, cod FROM lem WHERE cat = '{entry}' ORDER BY idlem"
- con = sqlite3.connect(f"file:{path}/test1.db?mode=ro", uri=True)
- answer_table = pd.read_sql(theSimpleQuery, con)
- if answer_table.empty:
- print ("Nessun risultato")
- sys.exit(1)
- else:
- return answer_table
- #%%
- path = "/Users/leonardocanova/Library/CloudStorage/OneDrive-ConsiglioNazionaledelleRicerche/TIGRO/Ricerche/db/first_db"
- entry = "come"
- conditions = {"IQ": "TS", "[Titolo Abbreviato]": ["Simintendi, a. 1333 (prat.)"]}
- conditions2 = {"[Autore]": ["Dante Alighieri", "Boccaccio, Giovanni"]}
- chronoconditions = {"[Anno iniziale]": 1320, "[Anno finale]": 1375}
- subcorpus = definiscisottocorpus(path,"datibib", conditions)
- print(subcorpus)
- #df=ricercacatgr(entry, path)
- df=ricercalemmi(interpreter(entry), path, 0, 0, subcorpus)
- dtale.show(df)
- # %%
-
|