# lemmario_v4.py

# %%
import csv
import json
import os
import re
import xml.etree.ElementTree as ET
from collections import defaultdict
  8. # %%
  9. baseDir = '../../DATA/OVI/datiniXML/'
  10. # %%
  11. # PREREQUISITE
  12. # Used to standardize lems in Gatto output xml files for easier parsing
  13. def surroundLems(letterRoot):
  14. textRoot = list(letterRoot.iter('div'))[0]
  15. texttags = [node for node in textRoot if node.tag == 'lem' or node.tag == 'w']
  16. doit = False
  17. for node in texttags:
  18. if doit and node.tag=='w':
  19. node.tag = 'NEWlem'
  20. node.attrib = prev_node.attrib
  21. prev_node.tag = 'OLDlem'
  22. if node.tag == 'lem' and node.tail != None:
  23. thelem = re.findall(r'\w+', node.tail)[0] # First word
  24. node.text = thelem
  25. node.tail = node.tail.replace(thelem, '')
  26. doit = False
  27. else:
  28. doit = True
  29. prev_node = node
  30. for node in textRoot.findall('OLDlem'):
  31. textRoot.remove(node)
  32. for node in textRoot.findall('NEWlem'):
  33. node.tag = 'lem'
  34. return textRoot
  35. # %%
  36. # Extract lems from Gatto xml files
  37. lemmiGatXml = {} # Output dict, storing lems from Gatto files; the keys are the OVI 'sigle'
  38. #
  39. basepath_gat = baseDir + 'xmlgat'
  40. for entry in os.listdir(basepath_gat): # loop on all files in the basepath_gat directory
  41. if os.path.isfile(os.path.join(basepath_gat, entry)): # prolly redundant
  42. gg = entry.split('.')[1]
  43. if gg != '':
  44. lemmiGatXml[gg]={"Filename": entry}
  45. for sigla, value in lemmiGatXml.items():
  46. try:
  47. pluto = surroundLems(ET.parse(basepath_gat+'/'+value["Filename"]).getroot())
  48. value["lemmi"]=[]
  49. for lem in pluto.iter('lem'):
  50. lemRef = {'lemma': lem.text, 'num_lemma': lem.attrib['n'], 'num_iperlemma': lem.attrib['type']}
  51. value["lemmi"].append(lemRef)
  52. except:
  53. print('Error in parsing file:', sigla)
  54. # %%
  55. lemmiGatTxt = []
  56. basepath_lemmi = baseDir + 'lemmi_txt'
  57. for entry in os.listdir(basepath_lemmi):
  58. if os.path.isfile(os.path.join(basepath_lemmi, entry)):
  59. ll = entry.split('.')[1]
  60. if ll != '':
  61. lemmiGatTxt.append([ll, entry])
  62. # %%
  63. xmlparse = ET.parse(baseDir + 'BiblioDatini.xml')
  64. root = xmlparse.getroot()
  65. biblio = root.findall("Biblio")
  66. sigle = []
  67. for bib in biblio:
  68. sigla = bib.find("sigla")
  69. sigle.append(sigla.text.lower())
  70. # %%
  71. OVI_data = open(baseDir + 'OVI_Data.csv', 'w')
  72. csvwriter = csv.writer(OVI_data)
  73. params = ["BiblioDatini", "lemmi_txt", "xmlgat"]
  74. csvwriter.writerow(params)
  75. for sigla in sigle:
  76. row = [sigla]
  77. no_lemma = " "
  78. lemma = " "
  79. gatto = " "
  80. row.append(no_lemma)
  81. for x in range(len(lemmiGatTxt)):
  82. if sigla == lemmiGatTxt[x][0]:
  83. lemma = lemmiGatTxt[x][1]
  84. row.append(lemma)
  85. try:
  86. gatto = lemmiGatXml[sigla]["File"]
  87. row.append(gatto)
  88. except KeyError:
  89. pass
  90. csvwriter.writerow(row)
  91. OVI_data.close()
  92. # %%
  93. iperlem_data = open(baseDir + 'lem_Data.csv', 'w')
  94. csvwriter = csv.writer(iperlem_data)
  95. params = ["sigla", "file", "num", "lemma", "iperlemma", "commento", "livello"]
  96. csvwriter.writerow(params)
  97. def write_lines(lines, sig, file):
  98. for line in lines:
  99. row = [sig, file]
  100. lem = re.split('\|', line)
  101. for l in lem:
  102. m = l.strip()
  103. row.append(m)
  104. try:
  105. gatLems = lemmiGatXml[sig]['lemmi']
  106. thisGatLem = next(filter(lambda el: el['num_lemma']==row[2], gatLems), '')
  107. row.insert(4, thisGatLem['num_iperlemma'])
  108. except:
  109. row.insert(4, '')
  110. csvwriter.writerow(row)
  111. for x in range(len(lemmiGatTxt)):
  112. sigla = lemmiGatTxt[x][0]
  113. file_name = lemmiGatTxt[x][1]
  114. #Cambia percorso
  115. f = open(baseDir + 'lemmi_txt/' + file_name, "r", encoding='latin-1')
  116. lines = f.readlines()
  117. write_lines(lines, sigla, file_name)
  118. f.close()
  119. iperlem_data.close()
  120. # %%
  121. redundantLemmas = []
  122. def write_lines_here(lines, sig):
  123. toRet = []
  124. for line in lines:
  125. row = [sig]
  126. lem = re.split('\|', line)
  127. for l in lem:
  128. m = l.strip()
  129. row.append(m)
  130. toRet.append(row)
  131. return toRet
  132. for x in range(len(lemmiGatTxt)):
  133. sigla = lemmiGatTxt[x][0]
  134. file_name = lemmiGatTxt[x][1]
  135. #Cambia percorso
  136. f = open(baseDir + 'lemmi_txt/' + file_name, "r", encoding='latin-1')
  137. lines = f.readlines()
  138. redundantLemmas = redundantLemmas + write_lines_here(lines, sigla)
  139. f.close()
  140. print(len(redundantLemmas))
  141. # %%
  142. preprefinal = list(filter(lambda row: 'IPERLEMMA' not in row[1], redundantLemmas))
  143. prefinal = list(map(lambda row: ((row[2], row[3], row[4]), (row[0], row[1])), preprefinal))
  144. print(len(prefinal))
  145. tmp = defaultdict(list)
  146. for k, v in prefinal: tmp[k].append(v)
  147. final = [{'lemma': {'forma_standard': k[0], 'categoria': k[1], 'note': k[2]}, 'coordinate': list(map(lambda el: {'file': el[0], 'n': el[1]}, v))} for k,v in tmp.items()]
  148. finalB = [{'lemma': {'forma_standard': k[0], 'categoria': k[1], 'note': k[2]}, 'coordinate': list(map(lambda el: el[0] + '_' + el[1], v))} for k,v in tmp.items()]
  149. final.sort(key=lambda el: el['lemma']['forma_standard'])
  150. finalB.sort(key=lambda el: el['lemma']['forma_standard'])
  151. print(len(final))
  152. print(len(finalB))
  153. # %%
  154. for ii, item in enumerate(finalB):
  155. item['id'] = ii
  156. # %%
  157. # IPERLEMMI
  158. preprefinalIPER = list(filter(lambda row: 'IPERLEMMA' in row[1], redundantLemmas))
  159. prefinalIPER = list(map(lambda row: (row[2], row[3], row[4]), preprefinalIPER))
  160. finalIPER = list(set(prefinalIPER))
  161. finalIPER.sort()
  162. # %%
  163. lem_data_unique = open(baseDir + 'lem_unique.csv', 'w')
  164. csvwriter = csv.writer(lem_data_unique)
  165. csvwriter.writerow(['lemma', 'categoria grammaticale', 'note'])
  166. for line in final:
  167. csvwriter.writerow(line)
  168. lem_data_unique.close()
  169. # IPERLEMMI
  170. iperlem_data_unique = open(baseDir + 'iperlem_unique.csv', 'w')
  171. csvwriter = csv.writer(iperlem_data_unique)
  172. csvwriter.writerow(['iperlemma', 'categoria grammaticale', 'note'])
  173. for line in finalIPER:
  174. #print(line)
  175. csvwriter.writerow(line)
  176. iperlem_data_unique.close()
# %%
# NOTE: disabled and stale - the entries of final are dicts, so entry[1] no longer applies
#cat_gramm = set(map(lambda entry: entry[1], final))
#cat_gramm2 = list(cat_gramm)
#cat_gramm2.sort()
#print(cat_gramm2)
  182. # %%
  183. final[2]
  184. print(final[2])
  185. # %%
  186. with open(baseDir + "power_lemmarioC.json", "w") as outfile:
  187. json.dump(finalB, outfile, indent=2)
  188. # %%