lemmario_v4.py 4.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138
  1. # %%
  2. import xml.etree.ElementTree as ET
  3. import os
  4. import csv
  5. from collections import defaultdict
  6. import re
  7. import json
  8. # %%
# Root data directory: the 'xmlgat' and 'lemmi_txt' input folders are read
# from here and the output JSON file is written here as well.
baseDir = '/Users/federicaspinelli/TEAMOVI/Parser/DATA/OVI/datiniXML/'
  10. # %%
  11. # PREREQUISITE
  12. # Used to standardize lems in Gatto output xml files for easier parsing
  13. def surroundLems(letterRoot):
  14. textRoot = list(letterRoot.iter('div'))[0]
  15. texttags = [node for node in textRoot if node.tag == 'lem' or node.tag == 'w']
  16. doit = False
  17. for node in texttags:
  18. if doit and node.tag=='w':
  19. node.tag = 'NEWlem'
  20. node.attrib = prev_node.attrib
  21. prev_node.tag = 'OLDlem'
  22. if node.tag == 'lem' and node.tail != None:
  23. thelem = re.findall(r'\w+', node.tail)[0] # First word
  24. node.text = thelem
  25. node.tail = node.tail.replace(thelem, '')
  26. doit = False
  27. else:
  28. doit = True
  29. prev_node = node
  30. for node in textRoot.findall('OLDlem'):
  31. textRoot.remove(node)
  32. for node in textRoot.findall('NEWlem'):
  33. node.tag = 'lem'
  34. return textRoot
  35. # %%
  36. # Extract lems from Gatto xml files
  37. lemmiGatXml = {} # Output dict, storing lems from Gatto files; the keys are the OVI 'sigle'
  38. #
  39. basepath_gat_xml = baseDir + 'xmlgat'
  40. for entry in os.listdir(basepath_gat_xml): # loop on all files in the basepath_gat directory
  41. if os.path.isfile(os.path.join(basepath_gat_xml, entry)): # prolly redundant
  42. gg = entry.split('.')[1]
  43. if gg != '':
  44. lemmiGatXml[gg]={"Filename": entry}
  45. parsingProblems = []
  46. for sigla, value in lemmiGatXml.items():
  47. try:
  48. value["lemmi"]=[]
  49. pluto = surroundLems(ET.parse(os.path.join(basepath_gat_xml, value["Filename"])).getroot())
  50. for lem in pluto.iter('lem'):
  51. lemRef = {'lemma': lem.text, 'num_lemma': lem.attrib['n'], 'num_iperlemma': lem.attrib['type']}
  52. value["lemmi"].append(lemRef)
  53. except:
  54. print('Error in parsing sigla:', sigla)
  55. parsingProblems.append(sigla)
  56. # %%
  57. # Extract lems from Gatto txt files
  58. lemmiGatTxt = {}
  59. #
  60. basepath_gat_txt = baseDir + 'lemmi_txt'
  61. for entry in os.listdir(basepath_gat_txt):
  62. if os.path.isfile(os.path.join(basepath_gat_txt, entry)):
  63. ll = entry.split('.')[1]
  64. if ll != '':
  65. lemmiGatTxt[ll]={"Filename": entry}
  66. for sigla, value in lemmiGatTxt.items():
  67. #Cambia percorso
  68. value["lemmi"]=[]
  69. f = open(os.path.join(basepath_gat_txt, value["Filename"]), "r", encoding='latin-1')
  70. lines = f.readlines()
  71. for line in lines:
  72. prelem = re.split('\|', line)
  73. lem = [el.strip() for el in prelem]
  74. value["lemmi"].append(lem)
  75. # %%
  76. print(lemmiGatTxt['l95'])
  77. print(lemmiGatXml['l95'])
  78. # %%
  79. # Do a redundant list of all lemmas (with repetitions) from the files
  80. redundantLemmas = []
  81. for sigla, valueTxt in lemmiGatTxt.items():
  82. valueXml = lemmiGatXml[sigla]
  83. iperLemmiLocal = list(filter(lambda row: 'IPERLEMMA' in row[0], valueTxt['lemmi']))
  84. for lemTxt in valueTxt['lemmi']:
  85. newLemTxt = lemTxt.copy()
  86. newLemTxt.insert(0, sigla)
  87. lemXml = next( filter(lambda el: el['num_lemma']==lemTxt[0], valueXml['lemmi']) , None)
  88. if lemXml is not None and lemXml['num_iperlemma']!='0':
  89. num_iperlemma = int(lemXml['num_iperlemma'])
  90. iperlemma = iperLemmiLocal[num_iperlemma-1]
  91. newLemTxt.append(iperlemma[1])
  92. else:
  93. newLemTxt.append('')
  94. redundantLemmas.append(newLemTxt)
  95. # %%
  96. # From 'redundantLemmas' generate a formatted json object without repetitions
  97. preprefinal = list(filter(lambda row: 'IPERLEMMA' not in row[1], redundantLemmas))
  98. prefinal = list(map(lambda row: ((row[2], row[3], row[4], row[5]), (row[0], row[1])), preprefinal))
  99. print(len(prefinal))
  100. tmp = defaultdict(list)
  101. for k, v in prefinal: tmp[k].append(v)
  102. finalC = [{'lemma': {'forma_standard': k[0], 'categoria': k[1], 'note': k[2], 'iperlemma': k[3]}, 'coordinate': list(map(lambda el: el[0] + '_' + el[1], v))} for k,v in tmp.items()]
  103. finalC.sort(key=lambda el: el['lemma']['forma_standard'])
  104. print(len(finalC))
  105. for ii, item in enumerate(finalC):
  106. item['id'] = ii
  107. # %%
  108. # Export the json to file
  109. with open(baseDir + "power_lemmarioC.json", "w") as outfile:
  110. json.dump(finalC, outfile, indent=2)
  111. # %%
# IPERLEMMI (hyperlemmas) - TO BE RETHOUGHT
  113. #preprefinalIPER = list(filter(lambda row: 'IPERLEMMA' in row[1], redundantLemmas))
  114. #prefinalIPER = list(map(lambda row: (row[2], row[3], row[4]), preprefinalIPER))
  115. #
  116. #finalIPER = list(set(prefinalIPER))
  117. #finalIPER.sort()
  118. # %%
  119. #iperlem_data_unique = open(baseDir + 'iperlem_unique.csv', 'w')
  120. #csvwriter = csv.writer(iperlem_data_unique)
  121. #
  122. #csvwriter.writerow(['iperlemma', 'categoria grammaticale', 'note'])
  123. #
  124. #for line in finalIPER:
  125. # csvwriter.writerow(line)
  126. #
  127. #iperlem_data_unique.close()