# query_handlers.py
import contextlib
import sqlite3

import pandas as pd

import interface_sqlite3.encdec.de_code as dc
from interface_sqlite3.actual_queries import prepareQuery
# First version
  6. class queryHandlerBasicSqlite:
  7. def __init__(self, dataConfig):
  8. try:
  9. dbPath = dataConfig['dbPath']
  10. dbfileDefault = dataConfig.get('dbfile_default')
  11. except:
  12. raise Exception('Missing required input in Data Provider Configuration')
  13. self.dbPath = dbPath
  14. self.dbfileDefault = dbfileDefault
  15. # Encoding
  16. self.dbEncoded = True if dataConfig.get("db_encoded") is True else False
  17. self.textsEncoded = True if dataConfig.get("texts_encoded") is True else False
  18. self.keyRing = None
  19. if self.dbEncoded or self.textsEncoded:
  20. keyPath = self.dbPath + 'keys/'
  21. self.keyRing = dc.keyRing(keyPath, self.dbEncoded, self.textsEncoded)
  22. def query(self, queryData, pandas=False, dbFile=None):
  23. # PREPARE THE QUERY
  24. # Formerly, a query string was pre-generated outside and
  25. # sent directly
  26. # Now the method processes a query data OBJECT
  27. # and creates the query (which may be complex)
  28. # accordingly
  29. queryData['pandas'] = pandas # Mostly redundant
  30. if self.dbEncoded:
  31. queryData = self.encodeQuery(queryData)
  32. queryToExecute = prepareQuery(queryData)
  33. # Get the connection to the DB
  34. dbFileLocal = dbFile if dbFile is not None else self.dbfileDefault
  35. if dbFileLocal is None:
  36. raise Exception("No db file specified with no default given -- can't execute query")
  37. #
  38. db = self.dbPath + dbFileLocal
  39. connection = sqlite3.connect(f"file:{db}?mode=ro", uri=True)
  40. # If the query is a simple string, execute it here:
  41. if type(queryToExecute)==str:
  42. if pandas:
  43. results = pd.read_sql(queryToExecute, connection)
  44. if(self.dbEncoded):
  45. results = self.db_results_decode_pandas(results)
  46. else:
  47. connection.row_factory = dict_factory
  48. queryReponse = connection.cursor().execute(queryToExecute)
  49. results = queryReponse.fetchall()
  50. if(self.dbEncoded):
  51. results = self.db_results_decode(results)
  52. else:
  53. # If not a string, 'queryToExecute' should be a method/function reference
  54. results = queryToExecute(connection, queryData)
  55. if self.dbEncoded:
  56. if pandas:
  57. results = self.db_results_decode_pandas(results)
  58. else:
  59. results = self.db_results_decode(results)
  60. connection.close()
  61. return results
  62. def textQuery(self, queryData, getFormatting=False):
  63. try:
  64. sigla = queryData['sigla']
  65. minChar = queryData['minChar']
  66. maxChar = queryData['maxChar']
  67. except:
  68. return None
  69. with open(f"{self.dbPath}/itxt/{sigla}", 'r', encoding="utf-32-le") as file1:
  70. file1.seek(4*minChar)
  71. cont = file1.read(maxChar-minChar)
  72. if self.textsEncoded and self.keyRing.textKeys.get(sigla) is not None:
  73. key = self.keyRing.textKeys.get(sigla)
  74. cont = dc.decodeTextByKey(cont, key, minChar-1)
  75. if not getFormatting:
  76. return cont
  77. else:
  78. return cont, self.getTextFormatting(sigla, minChar, maxChar)
  79. def getTextFormatting(self, sigla, minChar, maxChar):
  80. with open(f"{self.dbPath}/ftxt/{sigla}", 'rb') as file1:
  81. file1.seek(minChar-1)
  82. formatCodes = [char for char in file1.read(maxChar-minChar)]
  83. return formatCodes
  84. def encodeQuery(self, queryData):
  85. type = queryData.get('queryType')
  86. if type in ["forma", "lemma", "formaLemma", "lemmaForma"]:
  87. try:
  88. data = queryData['data']
  89. dataNorm = queryData['dataNorm']
  90. data = [dc.db_encode(self.keyRing.vettDictEnc, datum) for datum in data]
  91. dataNorm = [dc.db_encode(self.keyRing.vettDictEnc, datum) for datum in dataNorm]
  92. queryData['data'] = data
  93. queryData['dataNorm'] = dataNorm
  94. except KeyError as err:
  95. raise KeyError('Missing required data for query type ' + type + ': ' + str(err))
  96. return queryData
  97. def db_results_decode(self, result):
  98. for row in result:
  99. for key, value in row.items():
  100. if isColumnToDecode(key):
  101. row[key] = dc.db_decode(self.keyRing.vettDictDec, value)
  102. return result
  103. def db_results_decode_pandas(self, df):
  104. for col in df.columns:
  105. if isColumnToDecode(col):
  106. df[col] = df[col].apply( lambda el: dc.db_decode(self.keyRing.vettDictDec, el) )
  107. return df
# Utilities
# Dict factory for non-Pandas queries
  110. def dict_factory(cursor, row):
  111. fields = [column[0] for column in cursor.description]
  112. return {key: value for key, value in zip(fields, row)}
# Does the column data (in returned results) need decoding?
  114. def isColumnToDecode(col):
  115. columns = ['forma', 'lemma', 'cat_gr', 'disambiguatore']
  116. if col in columns or col.startswith('highlight'):
  117. return True
  118. return False