|
@@ -11,7 +11,47 @@ import pandas as pd
|
|
|
import dtale
|
|
|
import unicodedata
|
|
|
import sys
|
|
|
-
|
|
|
+
|
|
|
+from test_sottocorpora import definiscisottocorpus
|
|
|
+
|
|
|
def get_tables_occ(path):
    """Return the names of the occurrence tables stored in ``test1.db``.

    The database ``{path}/test1.db`` is opened read-only and every table
    listed in ``sqlite_master`` whose name starts with ``'Occ'`` is returned.

    Args:
        path: Directory that contains ``test1.db``.

    Returns:
        A list of table names (strings), in the order SQLite reports them.

    Raises:
        sqlite3.Error: If the database cannot be opened or queried.
    """
    conn = sqlite3.connect(f"file:{path}/test1.db?mode=ro", uri=True)
    try:
        cursor = conn.cursor()
        try:
            cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
            # Each fetched row is a 1-tuple holding the table name.
            return [name for (name,) in cursor.fetchall() if name.startswith('Occ')]
        finally:
            cursor.close()
    finally:
        # Release the handle even when the query above fails.
        conn.close()
|
|
|
+
|
|
|
+
|
|
|
+
|
|
|
def findtexts(type, df, listOcc, path, subcorpus=None):
    """Collect occurrence rows from the ``Occ*`` tables for the codes in *df*.

    For every table named in *listOcc* a SELECT joining the table with
    ``intbib`` (on ``ntx``) and ``lem`` (on ``indlem = lem.cod``) is run
    against ``{path}/test1.db`` (opened read-only) and the results are
    concatenated.

    Args:
        type: Selection mode (the name shadows the builtin but is kept so
            existing callers keep working):
            ``0`` - rows whose ``cod`` is in ``df["cod"]``;
            ``1`` - rows whose ``indlem`` is in ``df["cod"]``;
            ``2`` - rows whose ``indlem`` is in ``df["cod"]``, plus rows with
            ``indlem = 0`` whose ``cod`` is one of the ``forma`` values that
            table ``pfl`` associates with those lemmas.
            Any other value selects nothing.
        df: DataFrame with a ``cod`` column holding the codes to look up.
        listOcc: Names of the occurrence tables to scan.
        path: Directory that contains ``test1.db``.
        subcorpus: Optional collection of ``sigla`` values; when given, only
            rows whose ``sigla`` is in it are kept.

    Returns:
        A DataFrame with the selected rows (empty when nothing was selected).

    Raises:
        sqlite3.Error / pandas errors: If the database cannot be opened or a
            query fails.
    """
    # Shared SELECT ... FROM ... JOIN part; only the WHERE clause differs per mode.
    select_template = (
        "SELECT tab.cod, tab.indlem, tab.ntx, tab.pitxt, tab.elemlen, tab.mappa, "
        "tab.numperiod, tab.links, tab.numorg, intbib.sigla, tab.vol, tab.pag, "
        "tab.riga, tab.col, tab.tipostanza, tab.stanza, tab.verso, tab.numbrano, "
        "lem.spec AS lemma, lem.cat AS cat_gr, lem.omo AS disambiguatore "
        "FROM {table} AS tab "
        "INNER JOIN intbib ON tab.ntx = intbib.ntx "
        "INNER JOIN lem ON tab.indlem = lem.cod "
        "WHERE {where}"
    )

    # The code list does not depend on the table, so build it once.
    strlist = ",".join(str(c) for c in df["cod"])
    tables = list(listOcc)
    frames = []

    con = sqlite3.connect(f"file:{path}/test1.db?mode=ro", uri=True)
    try:
        where = None
        if tables:
            if type == 0:
                where = f"tab.cod IN ({strlist})"
            elif type == 1:
                where = f"tab.indlem IN ({strlist})"
            elif type == 2:
                # The lemma -> form lookup is table-independent: run it once.
                subquery = f"SELECT DISTINCT lemma, forma FROM pfl WHERE lemma IN ({strlist})"
                subdf = pd.read_sql(subquery, con)
                strform = ",".join(str(c) for c in subdf["forma"])
                where = f"tab.indlem IN ({strlist}) OR (tab.indlem = 0 AND tab.cod IN ({strform}))"

        if where is not None:
            for table in tables:
                query = select_template.format(table=table, where=where)
                frames.append(pd.read_sql(query, con))
    finally:
        # Always release the handle, including when a query fails.
        con.close()

    # Concatenate once instead of growing a DataFrame inside the loop.
    textlist = pd.concat(frames) if frames else pd.DataFrame()

    # With no collected rows there is no 'sigla' column to filter on.
    if subcorpus is not None and 'sigla' in textlist.columns:
        textlist = textlist[textlist['sigla'].isin(subcorpus)]
    return textlist
|
|
|
|
|
|
|
|
|
def combinations(s):
|
|
@@ -55,7 +95,7 @@ def counter (results):
|
|
|
|
|
|
|
|
|
|
|
|
-def ricercaforme (entries, path, espansa, raddoppiata):
|
|
|
+def ricercaforme (entries, path, espansa, raddoppiata, subcorpus=None):
|
|
|
if espansa == 0:
|
|
|
data=" OR spec LIKE ".join(entries)
|
|
|
doubleddata=" OR spec LIKE ".join(inizialeraddoppiata(entries))
|
|
@@ -65,14 +105,6 @@ def ricercaforme (entries, path, espansa, raddoppiata):
|
|
|
else:
|
|
|
theSimpleQuery = f"SELECT spec AS forma, nocc AS occ, cod FROM form WHERE spec LIKE {data} ORDER BY idfor"
|
|
|
|
|
|
- con = sqlite3.connect(f"file:{path}/test1.db?mode=ro", uri=True)
|
|
|
- answer_table = pd.read_sql(theSimpleQuery, con)
|
|
|
- if answer_table.empty:
|
|
|
- print ("Nessun risultato")
|
|
|
- sys.exit(1)
|
|
|
- else:
|
|
|
- return answer_table
|
|
|
-
|
|
|
else:
|
|
|
|
|
|
data=" OR spec LIKE ".join(entries)
|
|
@@ -85,45 +117,25 @@ def ricercaforme (entries, path, espansa, raddoppiata):
|
|
|
else:
|
|
|
theSimpleQuery = f"SELECT DISTINCT spec AS forma, nocc AS occ, cod FROM form WHERE (spec LIKE {data}) OR (norm LIKE {data_norm}) ORDER BY idfor"
|
|
|
|
|
|
- con = sqlite3.connect(f"file:{path}/test1.db?mode=ro", uri=True)
|
|
|
- answer_table = pd.read_sql(theSimpleQuery, con)
|
|
|
- if answer_table.empty:
|
|
|
- print ("Nessun risultato")
|
|
|
- sys.exit(1)
|
|
|
- else:
|
|
|
- return answer_table
|
|
|
-
|
|
|
- """if espansa == 0:
|
|
|
-
|
|
|
- data=" OR spec LIKE ".join(entries)
|
|
|
- doubleddata=" OR spec LIKE ".join(inizialeraddoppiata(entries))
|
|
|
-
|
|
|
- if raddoppiata == 1:
|
|
|
- theSimpleQuery = "SELECT spec AS forma, nocc AS occ, cod FROM form WHERE spec LIKE " + data + " OR spec LIKE " + doubleddata + "ORDER BY idfor"
|
|
|
- else:
|
|
|
- theSimpleQuery = "SELECT spec AS forma, nocc AS occ, cod FROM form WHERE spec LIKE " + data + " ORDER BY idfor"
|
|
|
-
|
|
|
- con = sqlite3.connect("file:" + path + "/db/test1.db" + "?mode=ro", uri=True)
|
|
|
- answer_table = pd.read_sql(theSimpleQuery, con)
|
|
|
- return answer_table
|
|
|
-
|
|
|
+ con = sqlite3.connect(f"file:{path}/test1.db?mode=ro", uri=True)
|
|
|
+ answer_table = pd.read_sql(theSimpleQuery, con)
|
|
|
+ if answer_table.empty:
|
|
|
+ print ("Nessun risultato")
|
|
|
+ sys.exit(1)
|
|
|
else:
|
|
|
-
|
|
|
- data=" OR spec LIKE ".join(entries)
|
|
|
- data_norm=" OR norm LIKE ".join(list_normalize(entries))
|
|
|
- doubleddata_norm=" OR norm LIKE ".join(list_normalize(inizialeraddoppiata(entries)))
|
|
|
- doubleddata=" OR spec LIKE ".join(inizialeraddoppiata(entries))
|
|
|
-
|
|
|
- if raddoppiata == 1:
|
|
|
- theSimpleQuery = "SELECT DISTINCT spec AS forma, nocc AS occ, cod FROM form WHERE (spec LIKE " + data +") OR (norm LIKE " + data_norm + ") OR (spec LIKE " + doubleddata + ") OR (norm LIKE " + doubleddata_norm + ")" + " ORDER BY idfor"
|
|
|
+ if subcorpus == None:
|
|
|
+ return answer_table
|
|
|
else:
|
|
|
- theSimpleQuery = "SELECT DISTINCT spec AS forma, nocc AS occ, cod FROM form WHERE (spec LIKE " + data +") OR (norm LIKE " + data_norm + ")" + " ORDER BY idfor"
|
|
|
-
|
|
|
- con = sqlite3.connect("file:" + path + "/db/test1.db" + "?mode=ro", uri=True)
|
|
|
- answer_table = pd.read_sql(theSimpleQuery, con)
|
|
|
- return answer_table"""
|
|
|
+ listOcc = get_tables_occ(path)
|
|
|
+ textlist = findtexts(0, answer_table, listOcc, path, subcorpus)
|
|
|
+ counts = textlist.groupby('cod').size().reset_index(name='count')
|
|
|
+ answer_table = pd.merge(answer_table, counts, on='cod', how='left')
|
|
|
+ answer_table['occ'] = answer_table['count'].fillna(answer_table['occ']).astype(int)
|
|
|
+ answer_table = answer_table.dropna(subset=['count'])
|
|
|
+ return answer_table
|
|
|
+
|
|
|
|
|
|
-def ricercalemmi (entries, path, espansa, raddoppiata):
|
|
|
+def ricercalemmi (entries, path, espansa, raddoppiata, subcorpus=None):
|
|
|
if espansa == 0:
|
|
|
data = " OR spec LIKE ".join(entries)
|
|
|
doubleddata = " OR spec LIKE ".join(inizialeraddoppiata(entries))
|
|
@@ -133,15 +145,8 @@ def ricercalemmi (entries, path, espansa, raddoppiata):
|
|
|
else:
|
|
|
theSimpleQuery = f"SELECT spec AS lemma, cat AS cat_gr, omo AS disambiguatore, nocc AS occ, cod FROM lem WHERE spec LIKE {data} ORDER BY idlem"
|
|
|
|
|
|
- con = sqlite3.connect(f"file:{path}/test1.db?mode=ro", uri=True)
|
|
|
- answer_table = pd.read_sql(theSimpleQuery, con)
|
|
|
- if answer_table.empty:
|
|
|
- print ("Nessun risultato")
|
|
|
- sys.exit(1)
|
|
|
- else:
|
|
|
- return answer_table
|
|
|
-
|
|
|
else:
|
|
|
+
|
|
|
data = " OR spec LIKE ".join(entries)
|
|
|
data_norm = " OR norm LIKE ".join(list_normalize(entries))
|
|
|
doubleddata_norm = " OR norm LIKE ".join(list_normalize(inizialeraddoppiata(entries)))
|
|
@@ -152,43 +157,23 @@ def ricercalemmi (entries, path, espansa, raddoppiata):
|
|
|
else:
|
|
|
theSimpleQuery = f"SELECT DISTINCT spec AS lemma, cat AS cat_gr, omo AS disambiguatore, nocc AS occ, cod FROM lem WHERE (spec LIKE {data}) OR (norm LIKE {data_norm}) ORDER BY idlem"
|
|
|
|
|
|
- con = sqlite3.connect(f"file:{path}/test1.db?mode=ro", uri=True)
|
|
|
- answer_table = pd.read_sql(theSimpleQuery, con)
|
|
|
- if answer_table.empty:
|
|
|
- print ("Nessun risultato")
|
|
|
- sys.exit(1)
|
|
|
+ con = sqlite3.connect(f"file:{path}/test1.db?mode=ro", uri=True)
|
|
|
+ answer_table = pd.read_sql(theSimpleQuery, con)
|
|
|
+ if answer_table.empty:
|
|
|
+ print ("Nessun risultato")
|
|
|
+ sys.exit(1)
|
|
|
+ else:
|
|
|
+ if subcorpus == None:
|
|
|
+ return answer_table
|
|
|
else:
|
|
|
+ listOcc = get_tables_occ(path)
|
|
|
+ textlist = findtexts(1, answer_table, listOcc, path, subcorpus)
|
|
|
+ counts = textlist.groupby('indlem').size().reset_index(name='count')
|
|
|
+ answer_table = pd.merge(answer_table, counts, left_on='cod', right_on='indlem')
|
|
|
+ answer_table['occ'] = answer_table['count'].fillna(answer_table['occ']).astype(int)
|
|
|
+ answer_table = answer_table.dropna(subset=['count'])
|
|
|
return answer_table
|
|
|
-
|
|
|
- """if espansa == 0:
|
|
|
-
|
|
|
- data=" OR spec LIKE ".join(entries)
|
|
|
- doubleddata=" OR spec LIKE ".join(inizialeraddoppiata(entries))
|
|
|
|
|
|
- if raddoppiata == 1:
|
|
|
- theSimpleQuery = "SELECT spec AS lemma, cat AS cat_gr, omo AS disambiguatore, nocc AS occ, cod FROM lem WHERE spec LIKE " + data + " OR spec LIKE " + doubleddata + "ORDER BY idlem"
|
|
|
- else:
|
|
|
- theSimpleQuery = "SELECT spec AS lemma, cat AS cat_gr, omo AS disambiguatore, nocc AS occ, cod FROM lem WHERE spec LIKE " + data + " ORDER BY idlem"
|
|
|
-
|
|
|
- con = sqlite3.connect("file:" + path + "/db/test1.db" + "?mode=ro", uri=True)
|
|
|
- answer_table = pd.read_sql(theSimpleQuery, con)
|
|
|
- return answer_table
|
|
|
-
|
|
|
- else:
|
|
|
-
|
|
|
- data=" OR spec LIKE ".join(entries)
|
|
|
- data_norm=" OR norm LIKE ".join(list_normalize(entries))
|
|
|
- doubleddata_norm=" OR norm LIKE ".join(list_normalize(inizialeraddoppiata(entries)))
|
|
|
- doubleddata=" OR spec LIKE ".join(inizialeraddoppiata(entries))
|
|
|
-
|
|
|
- if raddoppiata == 1:
|
|
|
- theSimpleQuery = "SELECT DISTINCT spec AS lemma, cat AS cat_gr, omo AS disambiguatore, nocc AS occ, cod FROM lem WHERE (spec LIKE " + data +") OR (norm LIKE " + data_norm + ") OR (spec LIKE " + doubleddata + ") OR (norm LIKE " + doubleddata_norm + ")" + " ORDER BY idlem"
|
|
|
- else:
|
|
|
- theSimpleQuery = "SELECT DISTINCT spec AS lemma, cat AS cat_gr, omo AS disambiguatore, nocc AS occ, cod FROM lem WHERE (spec LIKE " + data +") OR (norm LIKE " + data_norm + ")" + " ORDER BY idlem"
|
|
|
-
|
|
|
- con = sqlite3.connect("file:" + path + "/db/test1.db" + "?mode=ro", uri=True)
|
|
|
- answer_table = pd.read_sql(theSimpleQuery, con)
|
|
|
- return answer_table"""
|
|
|
|
|
|
def ricercaformelemmi (entries, path, espansa, raddoppiata):
|
|
|
if espansa == 0:
|
|
@@ -328,9 +313,14 @@ def ricercacatgr (entry, path):
|
|
|
|
|
|
|
|
|
path = "/Users/leonardocanova/Library/CloudStorage/OneDrive-ConsiglioNazionaledelleRicerche/TIGRO/Ricerche/db/first_db"
|
|
|
-entry = "filius"
|
|
|
+entry = "come"
|
|
|
+conditions = {"IQ": "TS", "[Titolo Abbreviato]": ["Simintendi, a. 1333 (prat.)"]}
|
|
|
+conditions2 = {"[Autore]": ["Dante Alighieri", "Boccaccio, Giovanni"]}
|
|
|
+chronoconditions = {"[Anno iniziale]": 1320, "[Anno finale]": 1375}
|
|
|
+subcorpus = definiscisottocorpus(path,"datibib", conditions)
|
|
|
+print(subcorpus)
|
|
|
|
|
|
-df=ricercaforme(interpreter(entry), path, 0, 0)
|
|
|
+df=ricercalemmi(interpreter(entry), path, 0, 0, subcorpus)
|
|
|
dtale.show(df)
|
|
|
|
|
|
|