Procházet zdrojové kódy

Polars replacing Pandas

pinna před 6 měsíci
rodič
revize
aa6cc4f02f

+ 73 - 46
flask_be/engine/contexts.py

@@ -1,87 +1,111 @@
 import json
-import pandas as pd
+import polars as pl
 
 from .basic_queries import basicQueries
 from .utilities.format import formatAllContexts, formatContext
 
 
 # Executes query sequences to recover single and multiple contexts
-# Returns Pandas dataframes
+# Returns data in a dictionary format
 class contexts(basicQueries):
-    
+
     def __init__(self, dataConfig):
         super().__init__(dataConfig)
 
-    #%% funzione contesti multipli cumulativa
-    # Potrebbe essere unita alle cooccorrenze?
-    def contestimultipli (self, tipo_ricerca, ricerca, index = None):
-        ricercadf = pd.DataFrame(ricerca)
+    # %% Cumulative multi-context search: findtexts -> findcontexts -> findbib -> formatAllContexts
+    def contestimultipli(self, tipo_ricerca, ricerca, index=None):
+        ricercadf = pl.DataFrame(ricerca)
         textlist = self.findtexts(tipo_ricerca, ricercadf, index)
-        contexts = self.findcontexts (textlist)
-        bibliocontexts = self.findbib (contexts)
+        contexts = self.findcontexts(textlist)
+        bibliocontexts = self.findbib(contexts)
         highlights = formatAllContexts(bibliocontexts)
-        return highlights.to_dict(orient='records')
-
-    #%% funzione contesti singoli cumulativa
-    def contestosingolo (self, contestimultipli, indice, parole, periodi, brani):
-        ### droppa le colonne "highlight" che gli rompono le scatole###
-        contestimultipli = {k: v for k, v in contestimultipli.items() if not k.startswith('highlight')}
-        ###############################################################
-        contestimultiplidf = pd.DataFrame(contestimultipli, index=[0])
+        # One dict per row, matching what pandas' to_dict(orient='records') returned before
+        return highlights.to_dicts()
+
+    # %% Cumulative single-context search: drops 'highlight*' keys, then singlecontexts -> findlinks -> findbib
+    def contestosingolo(self, contestimultipli, indice, parole, periodi, brani):
+        contestimultipli = {k: v for k, v in contestimultipli.items() if
+                            not k.startswith('highlight')}
+        contestimultiplidf = pl.DataFrame(contestimultipli)
         contestosingolo = self.singlecontexts(contestimultiplidf, indice, parole, periodi, brani)
         braniassociati = self.findlinks(contestosingolo)
-        contestosingoloclean = self.findbib (braniassociati)
+        contestosingoloclean = self.findbib(braniassociati)
         contestosingoloclean = formatAllContexts(contestosingoloclean)
-        return contestosingoloclean.to_dict(orient='records')
-        
-    #%% funzione reperimento e raffinamento contesti singoli
+        # One dict per row, matching what pandas' to_dict(orient='records') returned before
+        return contestosingoloclean.to_dicts()
+
+    # %% Retrieve and refine one context by words ('parole'), periods ('periodi') or passages ('brani')
     def singlecontexts(self, textlist, index, parole, periodi, brani):
-        context = textlist.iloc[index]
+        context = textlist.row(index, named=True)
         contexts = []
         formats = []
         listOcc = self.listOcc
-        sigla = textlist.loc[index, "sigla"]
-        periodlocal = textlist.loc[index, "numperiod"]
-        ntxlocal = textlist.loc[index, "ntx"]
-        mappalocal = textlist.loc[index, "mappa"]
-        linkslocal = textlist.loc[index, "links"]
-        numbranolocal = textlist.loc[index, "numbrano"]
+        sigla = textlist[index, "sigla"]
+        periodlocal = textlist[index, "numperiod"]
+        ntxlocal = textlist[index, "ntx"]
+        mappalocal = textlist[index, "mappa"]
+        linkslocal = textlist[index, "links"]
+        numbranolocal = textlist[index, "numbrano"]
+        pointerlist = pl.DataFrame()
+
         if parole != 0:
-            pointerlist = pd.DataFrame()
             for table in listOcc:
-                queryData = {'queryType': 'singlecontext', 'querySubtype': 'parole', 'parole': parole, 'periodi': periodi, 'brani': brani, 'table': table, 'ntxlocal': ntxlocal, 'mappalocal': mappalocal, 'periodlocal': periodlocal, 'numbranolocal': numbranolocal}
-                queryresponse = self.queryHandler.query(queryData, pandas=True)
-                pointerlist = pd.concat([pointerlist, queryresponse])
-            fileQueryData = {'sigla': sigla, 'minChar': pointerlist["pitxt"].min(), 'maxChar': pointerlist["pitxt"].max()}
+                queryData = {'queryType': 'singlecontext', 'querySubtype': 'parole',
+                             'parole': parole, 'periodi': periodi, 'brani': brani, 'table': table,
+                             'ntxlocal': ntxlocal, 'mappalocal': mappalocal,
+                             'periodlocal': periodlocal, 'numbranolocal': numbranolocal}
+                queryresponse = self.queryHandler.query(queryData, polars=True)
+                pointerlist = pl.concat([pointerlist, queryresponse])
+            fileQueryData = {'sigla': sigla, 'minChar': pointerlist["pitxt"].min(),
+                             'maxChar': pointerlist["pitxt"].max()}
             cont, form = self.queryHandler.textQuery(fileQueryData, True)
             contexts.append(cont)
             formats.append(json.dumps(form))
-            context ['piniz'] = pointerlist["pitxt"].min()
-            context ['pfin'] = pointerlist["pitxt"].max()
+            context['piniz'] = pointerlist["pitxt"].min()
+            context['pfin'] = pointerlist["pitxt"].max()
+        # NOTE(review): the 'periodi' branch below still sends querySubtype 'parole' — confirm this is intended
         elif periodi != 0:
-            queryData = {'queryType': 'singlecontext', 'querySubtype': 'parole', 'parole': parole, 'periodi': periodi, 'brani': brani, 'table': table, 'ntxlocal': ntxlocal, 'mappalocal': mappalocal, 'periodlocal': periodlocal, 'numbranolocal': numbranolocal}
-            queryresponse = self.queryHandler.query(queryData, pandas=True)
-            fileQueryData = {'sigla': sigla, 'minChar': pointerlist["pitxt"].min(), 'maxChar': pointerlist["pitxt"].max()}
+            for table in listOcc:
+                queryData = {'queryType': 'singlecontext', 'querySubtype': 'parole',
+                             'parole': parole, 'periodi': periodi, 'brani': brani, 'table': table,
+                             'ntxlocal': ntxlocal, 'mappalocal': mappalocal,
+                             'periodlocal': periodlocal, 'numbranolocal': numbranolocal}
+                queryresponse = self.queryHandler.query(queryData, polars=True)
+                pointerlist = pl.concat([pointerlist, queryresponse])
+            fileQueryData = {'sigla': sigla, 'minChar': pointerlist["pitxt"].min(),
+                             'maxChar': pointerlist["pitxt"].max()}
             cont, form = self.queryHandler.textQuery(fileQueryData, True)
             contexts.append(cont)
             formats.append(json.dumps(form))
-            context ['piniz'] = queryresponse["piniz"].min()
-            context ['pfin'] = queryresponse["pfin"].max()
+            context['piniz'] = queryresponse["piniz"].min()
+            context['pfin'] = queryresponse["pfin"].max()
+
         elif brani != 0:
             if linkslocal == 0 or linkslocal == 1:
                 return "Nessun brano associato a questo contesto"
             else:
-                queryData = {'queryType': 'singlecontext', 'querySubtype': 'brani', 'parole': parole, 'periodi': periodi, 'brani': brani, 'table': table, 'ntxlocal': ntxlocal, 'mappalocal': mappalocal, 'periodlocal': periodlocal, 'numbranolocal': numbranolocal}
-                queryresponse = self.queryHandler.query(queryData, pandas=True)
-                fileQueryData = {'sigla': sigla, 'minChar': pointerlist["pitxt"].min(), 'maxChar': pointerlist["pitxt"].max()}
+                for table in listOcc:
+                    queryData = {'queryType': 'singlecontext', 'querySubtype': 'brani',
+                                 'parole': parole, 'periodi': periodi, 'brani': brani,
+                                 'table': table, 'ntxlocal': ntxlocal, 'mappalocal': mappalocal,
+                                 'periodlocal': periodlocal, 'numbranolocal': numbranolocal}
+                    queryresponse = self.queryHandler.query(queryData, polars=True)
+                    pointerlist = pl.concat([pointerlist, queryresponse])
+                fileQueryData = {'sigla': sigla, 'minChar': pointerlist["pitxt"].min(),
+                                 'maxChar': pointerlist["pitxt"].max()}
                 cont, form = self.queryHandler.textQuery(fileQueryData, True)
                 contexts.append(cont)
                 formats.append(json.dumps(form))
-                context ['piniz'] = queryresponse["piniz"].min()
-                context ['pfin'] = queryresponse["pfin"].max() 
+                context['piniz'] = queryresponse["piniz"].min()
+                context['pfin'] = queryresponse["pfin"].max()
+
         context['contesto'] = contexts[0]
         context['formattazione contesto'] = formats[0]
-        return pd.DataFrame(context).T.reset_index(drop=True)
+
+        # Build a one-row Polars DataFrame (one column per key), like the old pandas .T result
+        context_df = pl.DataFrame([context])
+
+        return context_df
     
     #%% funzione reperimento note e brani associati
     def findlinks (self, context):
@@ -92,6 +116,7 @@ class contexts(basicQueries):
         pinizlocal = context.loc[0, "piniz"]
         pfinlocal = context.loc[0, "pfin"]
         if linkslocal == 0:
+
             return context
         if linkslocal == 1:
             queryData = {'queryType': 'links', 'querySubtype': 'nota', 'ntxlocal': ntxlocal, 'pinizlocal': pinizlocal, 'pitxtlocal': pitxtlocal, 'pfinlocal': pfinlocal}
@@ -101,6 +126,7 @@ class contexts(basicQueries):
             context['nota'] = cont
             context['formattazione nota'] = json.dumps(form)
             context['nota formattata'] = formatContext(json.dumps(form))
+
             return context
         if linkslocal == 2:
             queryData = {'queryType': 'links', 'querySubtype': 'testo_associato', 'ntxlocal': ntxlocal, 'pinizlocal': pinizlocal, 'pitxtlocal': pitxtlocal, 'pfinlocal': pfinlocal}
@@ -125,4 +151,5 @@ class contexts(basicQueries):
             context['testo associato'] = cont2
             context['formattazione testo associato'] = json.dumps(form2)
             context['testo associato formattato'] = formatContext(cont2, json.dumps(form2))
+
         return context

+ 27 - 27
flask_be/engine/cooccorrenze.py

@@ -1,58 +1,58 @@
-# %%
-import pandas as pd
+import polars as pl
 
 from .basic_queries import basicQueries
 from .utilities.format import formatAllContexts
 
 
 # Executes query sequences to recover contexts with co-occurrences according to user input
-# Returns Pandas dataframes
+# Returns data in a dictionary format
 class cooccorrenze(basicQueries):
-    
+
     def __init__(self, dataConfig):
         super().__init__(dataConfig)
 
-    #%% funzione ricerca per cooccorrenze. 
+    #%% Co-occurrence search function.
     # Ha in input un array di arrays del tipo:
     # [forma/lemma_cercati, tipo_ricerca, ricerca_espansa, iniziale_raddoppiata].
     # l'attributo tipo_ricerca ha come valori ammessi: 0 per forme, 1 per lemmi, 2 per lemmi + occorrenze non lemmatizzate.
     # Permette di definire l'intervallo di ricerca (in numero di parole), la possibilità di cercare soltanto all'interno dello stesso periodo (0/1) e/o di cercare le occorrenze in modo ordinato (0/1)
     def ricerca_cooccorrenze(self, listaricerche, intervallo, periodo, ordinate):
-        occurrences = [] # una lista di Dicts con i codici dei lemmi/forme da cercare
+        occurrences = []  # a list of dicts holding the codes of the lemmas/forms to search for
         for ricerca, tipo, espansa, raddoppiata in listaricerche:
-            if tipo==0:
-                res1 = self.sendBasicQuery(ricerca, 'forma', espansa, raddoppiata, pandas=True)
-                if res1.empty:
+            if tipo == 0:
+                res1 = self.sendBasicQuery(ricerca, 'forma', espansa, raddoppiata, polars=True)
+                if res1.is_empty():
                     return []
-                occurrences.append({'codList': list(res1['cod']), 'querySubtype': 0})
-            elif tipo==1:
-                res1 = self.sendBasicQuery(ricerca, 'lemma', espansa, raddoppiata, pandas=True)
-                if res1.empty:
+                occurrences.append({'codList': res1['cod'].to_list(), 'querySubtype': 0})
+            elif tipo == 1:
+                res1 = self.sendBasicQuery(ricerca, 'lemma', espansa, raddoppiata, polars=True)
+                if res1.is_empty():
                     return []
-                occurrences.append({'codList': list(res1['cod']), 'querySubtype': 1})
-            elif tipo==2:
-                res1 = self.sendBasicQuery(ricerca, 'lemma', espansa, raddoppiata, pandas=True)
-                if res1.empty:
+                occurrences.append({'codList': res1['cod'].to_list(), 'querySubtype': 1})
+            elif tipo == 2:
+                res1 = self.sendBasicQuery(ricerca, 'lemma', espansa, raddoppiata, polars=True)
+                if res1.is_empty():
                     return []
-                codList = list(res1['cod'])
+                codList = res1['cod'].to_list()
                 subQueryData = {'queryType': 'pfl', 'codList': codList}
-                subdf = self.queryHandler.query(subQueryData, pandas=True)
-                formCodList = list(subdf['codForma'])
+                subdf = self.queryHandler.query(subQueryData, polars=True)
+                formCodList = subdf['codForma'].to_list()
                 occurrences.append({'codList': codList, 'formCodList': formCodList, 'querySubtype': 2})
 
-        if len(occurrences)==0:
+        if len(occurrences) == 0:
             return []
 
-        queryData = {'queryType': 'co-occurrences', 'occurrences': occurrences, 'intervallo': intervallo, 'periodo': periodo, 'ordinate': ordinate}
-        queryResponses = [self.queryHandler.query(dict(queryData, table=table), pandas=True) for table in self.listOcc]
-        listatesti = pd.concat(queryResponses)
+        queryData = {'queryType': 'co-occurrences', 'occurrences': occurrences, 'intervallo': intervallo,
+                     'periodo': periodo, 'ordinate': ordinate}
+        queryResponses = [self.queryHandler.query(dict(queryData, table=table), polars=True) for table in self.listOcc]
+        listatesti = pl.concat(queryResponses)
 
-        if listatesti.empty:
+        if listatesti.is_empty():
             return []
         else:
             contexts = self.findcontexts(listatesti)
             bibliocontexts = self.findbib(contexts)
-            clean = bibliocontexts.drop_duplicates(subset="contesto")
+            clean = bibliocontexts.unique(subset="contesto", keep="first", maintain_order=True)
             highlights = formatAllContexts(clean)
 
-        return highlights.to_dict(orient='records')
+        return highlights.to_dicts()

+ 23 - 6
flask_be/engine/utilities/format.py

@@ -1,19 +1,36 @@
 import json
-import pandas as pd
+import polars as pl
 
 
 # Text highlighting
-def formatAllContexts(bibliocontexts: pd.DataFrame):
+def formatAllContexts(bibliocontexts: pl.DataFrame) -> pl.DataFrame:
     index = 0
     for col in bibliocontexts.columns:
         if col.startswith('pitxt'):
             if index == 0:
-                bibliocontexts['formattazione contesto'] = bibliocontexts.apply (lambda row: addHighlightToFormatting(row['formattazione contesto'], int(row['pitxt'] - row['piniz']), int(row['elemlen'])), axis=1)
+                bibliocontexts = bibliocontexts.with_columns(
+                    pl.struct(['formattazione contesto', 'pitxt', 'piniz', 'elemlen']).apply(
+                        lambda x: addHighlightToFormatting(x['formattazione contesto'],
+                                                           int(x['pitxt'] - x['piniz']),
+                                                           int(x['elemlen']))
+                    ).alias('formattazione contesto')
+                )
             else:
-                bibliocontexts['formattazione contesto'] = bibliocontexts.apply (lambda row: addHighlightToFormatting(row['formattazione contesto'], int(row['pitxt_'+str(index)] - row['piniz']), int(row['elemlen_'+str(index)])), axis=1)
+                bibliocontexts = bibliocontexts.with_columns(
+                    pl.struct(['formattazione contesto', f'pitxt_{index}', 'piniz',
+                               f'elemlen_{index}']).apply(
+                        lambda x: addHighlightToFormatting(x['formattazione contesto'],
+                                                           int(x[f'pitxt_{index}'] - x['piniz']),
+                                                           int(x[f'elemlen_{index}']))
+                    ).alias('formattazione contesto')
+                )
             index += 1
-    
-    bibliocontexts['contesto formattato'] = bibliocontexts.apply (lambda row: formatContext(row['contesto'], row['formattazione contesto']), axis=1)
+    # Build the 'contesto formattato' column by applying formatContext to every row
+    bibliocontexts = bibliocontexts.with_columns(
+        pl.struct(['contesto', 'formattazione contesto']).apply(
+            lambda x: formatContext(x['contesto'], x['formattazione contesto'])
+        ).alias('contesto formattato')
+    )
 
     return bibliocontexts
 

+ 14 - 17
flask_be/interface_sqlite3/query_handlers.py

@@ -1,14 +1,15 @@
 import sqlite3
-import pandas as pd
+import polars as pl
 import interface_sqlite3.encdec.de_code as dc
 
 from interface_sqlite3.actual_queries import prepareQuery
 
+
 # First version
 class queryHandlerBasicSqlite:
 
     def __init__(self, dataConfig):
-        
+
         try:
             dbPath = dataConfig['dbPath']
             dbfileDefault = dataConfig.get('dbfile_default')
@@ -25,8 +26,7 @@ class queryHandlerBasicSqlite:
             keyPath = self.dbPath + 'keys/'
             self.keyRing = dc.keyRing(keyPath, self.dbEncoded, self.textsEncoded)
 
-    
-    def query(self, queryData, pandas=False, dbFile=None):
+    def query(self, queryData, polars=False, dbFile=None):
 
         # PREPARE THE QUERY
         # Formerly, a query string was pre-generated outside and
@@ -42,29 +42,26 @@ class queryHandlerBasicSqlite:
         dbFileLocal = dbFile if dbFile is not None else self.dbfileDefault
         if dbFileLocal is None:
             raise Exception("No db file specified with no default given -- can't execute query")
-        #
         db = self.dbPath + dbFileLocal
         connection = sqlite3.connect(f"file:{db}?mode=ro", uri=True)
 
-
         # If the query is a simple string, execute it here:
-        if type(queryToExecute)==str:
-            if pandas:
-                results = pd.read_sql(queryToExecute, connection)
-                if(self.dbEncoded):
-                    results = self.db_results_decode_pandas(results)
+        if type(queryToExecute) == str:
+            if polars:
+                results = pl.read_sql(queryToExecute, connection)
+                if self.dbEncoded:
+                    results = self.db_results_decode_polars(results)
             else:
                 connection.row_factory = dict_factory
                 queryReponse = connection.cursor().execute(queryToExecute)
                 results = queryReponse.fetchall()
-                if(self.dbEncoded):
+                if self.dbEncoded:
                     results = self.db_results_decode(results)
-        
         else:
             # If not a string, 'queryToExecute' should be a method/function reference
             results = queryToExecute(connection, queryData)
-            if(self.dbEncoded):
-                results = self.db_results_decode_pandas(results)
+            if self.dbEncoded:
+                results = self.db_results_decode_polars(results)
 
         connection.close()
 
@@ -121,10 +118,10 @@ class queryHandlerBasicSqlite:
                     row[key] = dc.db_decode(self.keyRing.vettDictDec, value)
         return result
 
-    def db_results_decode_pandas(self, df):
+    def db_results_decode_polars(self, df):  # decodes every column flagged by isColumnToDecode
         for col in df.columns:
             if isColumnToDecode(col):
-                df[col] = df[col].apply( lambda el: dc.db_decode(self.keyRing.vettDictDec, el) )
+                df = df.with_columns(pl.col(col).apply(lambda el: dc.db_decode(self.keyRing.vettDictDec, el)))
         return df