Browse Source

Encoding/Decoding implementation to test

Francesco 1 year ago
parent
commit
d581aee925

+ 0 - 0
flask_be/engine/data_interface/encdec/keys/aa1.csv → db/ndg2.gat4/keys/key_aa1.csv


+ 0 - 0
flask_be/engine/data_interface/encdec/keys/cd.csv → db/ndg2.gat4/keys/key_cd.csv


+ 0 - 0
flask_be/engine/data_interface/encdec/keys/ck.csv → db/ndg2.gat4/keys/key_ck.csv


+ 0 - 0
flask_be/engine/data_interface/encdec/keys/d17.csv → db/ndg2.gat4/keys/key_d17.csv


+ 0 - 0
flask_be/engine/data_interface/encdec/keys/fq.csv → db/ndg2.gat4/keys/key_fq.csv


+ 0 - 0
flask_be/engine/data_interface/encdec/keys/g.csv → db/ndg2.gat4/keys/key_g.csv


+ 0 - 0
flask_be/engine/data_interface/encdec/keys/i51.csv → db/ndg2.gat4/keys/key_i51.csv


+ 0 - 0
flask_be/engine/data_interface/encdec/keys/iya.csv → db/ndg2.gat4/keys/key_iya.csv


+ 0 - 0
flask_be/engine/data_interface/encdec/keys/lb1.csv → db/ndg2.gat4/keys/key_lb1.csv


+ 0 - 0
flask_be/engine/data_interface/encdec/keys/od2.csv → db/ndg2.gat4/keys/key_od2.csv


+ 0 - 0
flask_be/engine/data_interface/encdec/keys/p07.csv → db/ndg2.gat4/keys/key_p07.csv


+ 0 - 0
flask_be/engine/data_interface/encdec/keys/rd.csv → db/ndg2.gat4/keys/key_rd.csv


+ 0 - 0
flask_be/engine/data_interface/encdec/keys/vettSpec.csv → db/ndg2.gat4/keys/vettSpec.csv


+ 1 - 1
flask_be/engine/data_interface/QueryHandlerAbstract.py

@@ -1,7 +1,7 @@
 # An abstract class template for query handler factory return object
 # try a 'naive' implementation without using abc
 class QueryHandlerAbstract():
-    def __init__(self, dbPath, dbfileDefault):
+    def __init__(self):
+        """Refuse direct instantiation of the abstract base class.
+
+        The exact-type check only fires for QueryHandlerAbstract itself, so
+        instances of subclasses pass through.
+
+        Raises:
+            Exception: if the object being built is a plain QueryHandlerAbstract.
+        """
         if type(self) is QueryHandlerAbstract:
             raise Exception("QueryHandlerAbstract is an abstract class and can't be instantiated")
     

+ 1 - 6
flask_be/engine/data_interface/data_providers_setup.py

@@ -4,12 +4,7 @@ from interface_sqlite3.query_handlers import queryHandlerBasicSqlite
 def queryHandlerFactory(dataConfig):
+    """Build the query handler selected by ``dataConfig['data_interface']``.
+
+    Only the value 'sqlite3' is recognized; the whole configuration is then
+    handed to queryHandlerBasicSqlite.
+
+    Raises:
+        Exception: if the configured data interface is not recognized.
+    """
 
     if dataConfig.get('data_interface')=='sqlite3':
-        try:
-            dbPath = dataConfig['dbPath']
-            dbfileDefault = dataConfig['dbfile_default']
-        except:
-            raise Exception('Missing required input in Data Provider Configuration')
-        return queryHandlerBasicSqlite(dbPath, dbfileDefault)
+        return queryHandlerBasicSqlite(dataConfig)
     else:
+        # NOTE(review): if 'data_interface' is absent, .get() returns None and
+        # the string concatenation below raises TypeError instead of this
+        # Exception (assuming dataConfig is a dict) -- confirm and handle.
         raise Exception('Unrecognized data interface in app configuration: ' + dataConfig.get('data_interface'))
 

+ 0 - 0
flask_be/engine/data_interface/encdec/__init__.py → flask_be/interface_sqlite3/encdec/__init__.py


+ 50 - 26
flask_be/engine/data_interface/encdec/de_code.py → flask_be/interface_sqlite3/encdec/de_code.py

@@ -1,25 +1,51 @@
 #%%
 import csv
+from os import listdir
 
-KEY_PATH = 'keys/'# to be set in config
+class keyRing:
 
-def getVettSpec():
-    with open(KEY_PATH + "vettSpec.csv", 'r') as file1:
-        reader = csv.DictReader(file1)
-        vettSpec = [row for row in reader]
-        return vettSpec
+    def __init__(self, keyPath, dbEncoded, textsEncoded):
+        """Load the decoding material stored under ``keyPath``.
+
+        ``vettSpec`` is populated only when ``dbEncoded`` is truthy and
+        ``textKeys`` only when ``textsEncoded`` is truthy; each is whatever
+        getVettSpec / getKeys return for the corresponding flag.
+        """
+        # keyPath must be set first: both loaders read it from the instance.
+        self.keyPath = keyPath
+        specTable = self.getVettSpec(dbEncoded)
+        loadedKeys = self.getKeys(textsEncoded)
+        self.vettSpec = specTable
+        self.textKeys = loadedKeys
 
-vettSpec = getVettSpec() # also in config
+    def getVettSpec(self, dbEncoded):
+        """Read ``vettSpec.csv`` from the key directory.
+
+        Returns None when ``dbEncoded`` is falsy; otherwise a list with one
+        mapping per CSV row, keyed by the header row of the file.
+        """
+        if dbEncoded:
+            with open(self.keyPath + "vettSpec.csv", 'r') as specFile:
+                return list(csv.DictReader(specFile))
+        return None
 
-def getKeyByCode(code):
-    with open(KEY_PATH + code + ".csv", 'r') as file1:
-        reader = csv.reader(file1)
-        key = [int(row[0]) for index, row in enumerate(reader) if index>1]
-        return key
-# %%
+    def getKeys(self, textsEncoded):
+        """Load every text key found in the key directory.
+
+        Each file named ``key_<code>.csv`` under ``self.keyPath`` is read
+        with getKeyByCode and stored under ``<code>``.
+
+        Returns:
+            None when ``textsEncoded`` is falsy, otherwise a dict mapping each
+            code to its key (a list of ints). Key files that cannot be read
+            or parsed are skipped.
+        """
+        if not textsEncoded:
+            return None
+        prefix, suffix = 'key_', '.csv'
+        keyFiles = [
+            name for name in listdir(self.keyPath)
+            if name.startswith(prefix) and name.endswith(suffix)
+        ]
 
+        keys = {}
+        for keyFile in keyFiles:
+            # Strip only the leading prefix and trailing suffix, so a code
+            # that itself contains 'key_' or '.csv' is left intact.
+            code = keyFile[len(prefix):-len(suffix)]
+            try:
+                keys[code] = self.getKeyByCode(keyFile)
+            except (OSError, ValueError, IndexError, csv.Error):
+                # Best effort: an unreadable or malformed key file is skipped
+                # rather than aborting the whole key ring.
+                continue
+        return keys
+
+    def getKeyByCode(self, keyFile):
+        """Read one key file and return its values as a list of ints.
+
+        ``keyFile`` is a file name relative to ``self.keyPath``. The first
+        two rows of the CSV are ignored (presumably header rows -- confirm);
+        the first column of every following row is converted with int().
+        """
+        with open(self.keyPath + keyFile, 'r') as keyHandle:
+            rows = list(csv.reader(keyHandle))
+        return [int(row[0]) for row in rows[2:]]
+
+
+# Encoder/Decoders
+
+# DB field encoder/decoder
+# DB Columns that need this:
+# FORM -> norm, spec, invnorm, invspec
+# LEM  -> norm, spec, invnorm, invspec, cat, omo
+def db_decode(vettSpec, string0):
 
-def db_decode(string0):
     res = ""
     for char0 in string0:
         #1
@@ -29,8 +55,8 @@ def db_decode(string0):
         #3
         res += chr(int(char0ConvDec, 16)) # Si converte il codice esadecimale a decimale e si usa la built-in chr per recuperare il carattere
     return res
-
-def db_encode(string0):
+#
+def db_encode(vettSpec, string0):
     res = ""
     for char0 in string0:
         #1
@@ -41,9 +67,7 @@ def db_encode(string0):
         res += chr(int(char0ConvDec)) # Si usa la built-in chr per recuperare il carattere
     return res
 
-def code(char0, shift):
-    return chr(ord(char0) + shift)
-
+# Text encoder/decoder
 def decodeTextByKey(text, key, startInFile):
     initialOffset = startInFile % len(key)
     res = ""
@@ -51,19 +75,19 @@ def decodeTextByKey(text, key, startInFile):
         offset = k + initialOffset
         if offset >= len(key):
             offset = offset % len(key)
-        res += code(char0, -key[offset])
+        res += shiftchar(char0, -key[offset])
     return res
-
+#
 def codeTextByKey(text, key, startInFile):
+    """Encode ``text`` by shifting each character by a value taken from ``key``.
+
+    The character at index k is shifted forward by
+    ``key[(startInFile + k) % len(key)]`` code points, i.e. the key is
+    applied cyclically starting at ``startInFile``.
+    ``startInFile`` is presumably the character offset of ``text`` within
+    its source file -- confirm against callers.
+
+    Raises:
+        ZeroDivisionError: if ``key`` is empty.
+    """
+
     initialOffset = startInFile % len(key)
     res = ""
     for k, char0 in enumerate(text):
         offset = k + initialOffset
         if offset >= len(key):
             offset = offset % len(key)
-        res += code(char0, +key[offset])
+        res += shiftchar(char0, +key[offset])
     return res
-
-# %%
-getKeyByCode('p07')
-# %%
+#
+def shiftchar(char0, shift):
+    """Return the character whose code point is ``ord(char0) + shift``."""
+    codePoint = ord(char0) + shift
+    return chr(codePoint)

+ 65 - 8
flask_be/interface_sqlite3/query_handlers.py

@@ -1,21 +1,39 @@
 import sqlite3
 import pandas as pd
+import interface_sqlite3.encdec.de_code as dc
 
 from engine.data_interface.QueryHandlerAbstract import QueryHandlerAbstract
 
 # First version
 class queryHandlerBasicSqlite(QueryHandlerAbstract):
 
-    def __init__(self, dbPath, dbfileDefault):
+    def __init__(self, dataConfig):
+        """Configure the handler from the data-provider configuration.
+
+        Required keys: 'dbPath' and 'dbfile_default'. The optional flags
+        'db_encoded' and 'texts_encoded' enable encoding only when they are
+        exactly True; if either is set, a key ring is loaded from the 'keys/'
+        directory under dbPath.
+
+        Raises:
+            Exception: if a required configuration key is missing.
+        """
+        try:
+            dbPath = dataConfig['dbPath']
+            dbfileDefault = dataConfig['dbfile_default']
+        except (KeyError, TypeError) as err:
+            raise Exception('Missing required input in Data Provider Configuration') from err
         self.dbPath = dbPath
         self.dbfileDefault = dbfileDefault
+
+        # Encoding
+        self.dbEncoded = dataConfig.get("db_encoded") is True
+        self.textsEncoded = dataConfig.get("texts_encoded") is True
+        # Always define keyRing (same spelling the other methods use) so the
+        # attribute exists even when no encoding is configured.
+        self.keyRing = None
+        if self.dbEncoded or self.textsEncoded:
+            # NOTE(review): plain concatenation assumes dbPath ends with a
+            # path separator -- confirm against the app configuration.
+            keyPath = self.dbPath + 'keys/'
+            self.keyRing = dc.keyRing(keyPath, self.dbEncoded, self.textsEncoded)
+
     
     def query(self, queryData, pandas=False, dbFile=None):
 
         # Formerly the query string was pre-generated outside and
-        # sent here in lieu of the query data
+        # sent here _in lieu_ of the query data
         # Now the method processes a query data OBJECT and creates the query
         # accordingly
+        if self.dbEncoded:
+            queryData = self.encodeQuery(queryData)
         queryString = prepareQueryString(queryData)
 
         dbfileLocal = dbFile if dbFile is not None else self.dbfileDefault
@@ -26,13 +44,18 @@ class queryHandlerBasicSqlite(QueryHandlerAbstract):
         # PANDAS?
         if pandas:
             results = pd.read_sql(queryString, connection)
+            if(self.dbEncoded):
+                results = self.db_results_decode_pandas(results)
         
         else:
             connection.row_factory = dict_factory
             queryReponse = connection.cursor().execute(queryString)
             results = queryReponse.fetchall()
+            if(self.dbEncoded):
+                results = self.db_results_decode(results)
 
         connection.close()
+
         return results
     
     def textQuery(self, queryData):
@@ -46,7 +69,38 @@ class queryHandlerBasicSqlite(QueryHandlerAbstract):
         with open(f"{self.dbPath}/itxt/{sigla}", 'r', encoding="utf-32-le") as file1:
             file1.seek(4*minChar)
             cont = file1.read(maxChar-minChar)
-            return cont
+        
+        if self.textsEncoded and self.keyRing.get(sigla) is not None:
+            key = self.keyRing.get(sigla)
+            cont = dc.decodeTextByKey(cont, key)
+        
+        return cont
+    
+    def encodeQuery(self, queryData):
+        """Return the query data with its search terms DB-encoded.
+
+        For query types 'forma', 'lemma', 'formaLemma' and 'lemmaForma' a
+        shallow copy of ``queryData`` is returned in which every entry of
+        'data' and 'dataNorm' has been passed through ``dc.db_encode``. The
+        caller's dict is left untouched so that reusing it does not encode
+        the terms twice.
+
+        Raises:
+            KeyError: if 'data' or 'dataNorm' is missing for one of the
+                query types listed above.
+        """
+        queryType = queryData.get('queryType')
+        if queryType not in ["forma", "lemma", "formaLemma", "lemmaForma"]:
+            # NOTE(review): other query types are passed through unencoded --
+            # confirm none of them carries terms that need encoding.
+            return queryData
+        try:
+            data = queryData['data']
+            dataNorm = queryData['dataNorm']
+        except KeyError as err:
+            raise KeyError('Missing required data for query type ' + queryType + ': ' + str(err)) from err
+        vettSpec = self.keyRing.vettSpec
+        encoded = dict(queryData)
+        encoded['data'] = [dc.db_encode(vettSpec, datum) for datum in data]
+        encoded['dataNorm'] = [dc.db_encode(vettSpec, datum) for datum in dataNorm]
+        return encoded
+
+    def db_results_decode(self, result):
+        """Decode, in place, the encoded columns of non-pandas query results.
+
+        ``result`` is a list of row dicts (as produced by dict_factory).
+        Every string value whose column name satisfies isColumnToDecode is
+        replaced by its decoded form; other values (including None) are
+        left as they are. The same list is returned.
+        """
+        vettSpec = self.keyRing.vettSpec
+        for row in result:
+            # Overwriting existing keys while iterating items() is safe: the
+            # dict does not change size.
+            for key, value in row.items():
+                if isColumnToDecode(key) and isinstance(value, str):
+                    row[key] = dc.db_decode(vettSpec, value)
+        return result
+
+    def db_results_decode_pandas(self, df):
+        """Decode, in place, the encoded columns of a pandas result frame.
+
+        Every column whose name satisfies isColumnToDecode has its string
+        cells passed through ``dc.db_decode``; non-string cells (e.g. missing
+        values) are left unchanged. The same DataFrame is returned.
+        """
+        vettSpec = self.keyRing.vettSpec
+
+        def decodeCell(el):
+            # db_decode iterates over characters, so only strings are decoded.
+            return dc.db_decode(vettSpec, el) if isinstance(el, str) else el
+
+        for col in df.columns:
+            if isColumnToDecode(col):
+                df[col] = df[col].apply(decodeCell)
+        return df
 
 
 
@@ -189,14 +243,17 @@ def prepareQueryString(queryData):
     #####
     else:
         raise ValueError('Unrecognized query type: ' + type)
-    
 
 
-
-
-
-# Utility for non-Pandas queries
+# Dict factory for non-Pandas queries
 def dict_factory(cursor, row):
+    """Turn one result row into a dict keyed by column name.
+
+    Column names are taken from the first element of each entry in
+    ``cursor.description`` and paired positionally with the values in ``row``.
+    """
     fields = [column[0] for column in cursor.description]
     return {key: value for key, value in zip(fields, row)}
 
+
+# Does the column data (in returned results) need decoding?
+def isColumnToDecode(col):
+    """Tell whether a result column holds encoded data.
+
+    True for 'forma', 'lemma', 'cat_gr', 'disambiguatore' and for any column
+    whose name starts with 'highlight'; False otherwise.
+    """
+    encodedColumns = ['forma', 'lemma', 'cat_gr', 'disambiguatore']
+    return col in encodedColumns or col.startswith('highlight')