# lemmario_v3.py (4.6 KB)
  1. # %%
  2. import xml.etree.ElementTree as ET
  3. import os
  4. import csv
  5. from collections import OrderedDict, defaultdict
  6. import re
  7. import json
  8. # %%
  9. baseDir = '/Users/federicaspinelli/TEAMOVI/Parser/Data/OVI/datiniXML/'
  10. # %%
  11. gat = []
  12. basepath_gat = baseDir + 'xmlgat'
  13. for entry in os.listdir(basepath_gat):
  14. if os.path.isfile(os.path.join(basepath_gat, entry)): # prolly redundant
  15. gg = entry.split('.')[1]
  16. if gg != '':
  17. gat.append([gg, entry])
  18. lemmi = []
  19. basepath_lemmi = baseDir + 'lemmi_txt'
  20. for entry in os.listdir(basepath_lemmi):
  21. if os.path.isfile(os.path.join(basepath_lemmi, entry)):
  22. ll = entry.split('.')[1]
  23. if ll != '':
  24. lemmi.append([ll, entry])
  25. # %%
  26. xmlparse = ET.parse(baseDir + 'BiblioDatini.xml')
  27. root = xmlparse.getroot()
  28. biblio = root.findall("Biblio")
  29. sigle = []
  30. for bib in biblio:
  31. sigla = bib.find("sigla")
  32. sigle.append(sigla.text.lower())
  33. # %%
  34. OVI_data = open(baseDir + 'OVI_Data.csv', 'w')
  35. csvwriter = csv.writer(OVI_data)
  36. params = ["BiblioDatini", "lemmi_txt", "xmlgat"]
  37. csvwriter.writerow(params)
  38. for sigla in sigle:
  39. row = [sigla]
  40. no_lemma = " "
  41. lemma = " "
  42. gatto = " "
  43. row.append(no_lemma)
  44. for x in range(len(lemmi)):
  45. if sigla == lemmi[x][0]:
  46. lemma = lemmi[x][1]
  47. row.append(lemma)
  48. for x in range(len(gat)):
  49. if sigla == gat[x][0]:
  50. gatto = gat[x][1]
  51. row.append(gatto)
  52. csvwriter.writerow(row)
  53. OVI_data.close()
  54. # %%
  55. iperlem_data = open(baseDir + 'lem_Data.csv', 'w')
  56. csvwriter = csv.writer(iperlem_data)
  57. params = ["sigla", "file", "num", "lemma", "commento", "livello"]
  58. csvwriter.writerow(params)
  59. def write_lines(lines, sig, file):
  60. for line in lines:
  61. row = [sig, file]
  62. lem = re.split('\|', line)
  63. for l in lem:
  64. m = l.strip()
  65. row.append(m)
  66. csvwriter.writerow(row)
  67. for x in range(len(lemmi)):
  68. sigla = lemmi[x][0]
  69. file_name = lemmi[x][1]
  70. #Cambia percorso
  71. f = open(baseDir + 'lemmi_txt/' + file_name, "r", encoding='latin-1')
  72. lines = f.readlines()
  73. write_lines(lines, sigla, file_name)
  74. f.close()
  75. iperlem_data.close()
  76. # %%
  77. redundantLemmas = []
  78. def write_lines_here(lines, sig):
  79. toRet = []
  80. for line in lines:
  81. row = [sig]
  82. lem = re.split('\|', line)
  83. for l in lem:
  84. m = l.strip()
  85. row.append(m)
  86. toRet.append(row)
  87. return toRet
  88. for x in range(len(lemmi)):
  89. sigla = lemmi[x][0]
  90. file_name = lemmi[x][1]
  91. #Cambia percorso
  92. f = open(baseDir + 'lemmi_txt/' + file_name, "r", encoding='latin-1')
  93. lines = f.readlines()
  94. redundantLemmas = redundantLemmas + write_lines_here(lines, sigla)
  95. f.close()
  96. print(len(redundantLemmas))
  97. # %%
  98. preprefinal = list(filter(lambda row: 'IPERLEMMA' not in row[1], redundantLemmas))
  99. prefinal = list(map(lambda row: ((row[2], row[3], row[4]), (row[0], row[1])), preprefinal))
  100. print(len(prefinal))
  101. tmp = defaultdict(list)
  102. for k, v in prefinal: tmp[k].append(v)
  103. final = [{'lemma': {'forma_standard': k[0], 'categoria': k[1], 'note': k[2]}, 'coordinate': list(map(lambda el: {'file': el[0], 'n': el[1]}, v))} for k,v in tmp.items()]
  104. finalB = [{'lemma': {'forma_standard': k[0], 'categoria': k[1], 'note': k[2]}, 'coordinate': list(map(lambda el: el[0] + '_' + el[1], v))} for k,v in tmp.items()]
  105. final.sort(key=lambda el: el['lemma']['forma_standard'])
  106. finalB.sort(key=lambda el: el['lemma']['forma_standard'])
  107. print(len(final))
  108. print(len(finalB))
  109. # %%
  110. for ii, item in enumerate(finalB):
  111. item['id'] = ii
  112. # %%
  113. # IPERLEMMI
  114. preprefinalIPER = list(filter(lambda row: 'IPERLEMMA' in row[1], redundantLemmas))
  115. prefinalIPER = list(map(lambda row: (row[2], row[3], row[4]), preprefinalIPER))
  116. finalIPER = list(set(prefinalIPER))
  117. finalIPER.sort()
  118. # %%
  119. lem_data_unique = open(baseDir + 'lem_unique.csv', 'w')
  120. csvwriter = csv.writer(lem_data_unique)
  121. csvwriter.writerow(['lemma', 'categoria grammaticale', 'note'])
  122. for line in final:
  123. csvwriter.writerow(line)
  124. lem_data_unique.close()
  125. # IPERLEMMI
  126. iperlem_data_unique = open(baseDir + 'iperlem_unique.csv', 'w')
  127. csvwriter = csv.writer(iperlem_data_unique)
  128. csvwriter.writerow(['iperlemma', 'categoria grammaticale', 'note'])
  129. for line in finalIPER:
  130. #print(line)
  131. csvwriter.writerow(line)
  132. iperlem_data_unique.close()
  133. # %%
  134. #cat_gramm = set(map(lambda entry: entry[1], final))
  135. #cat_gramm2 = list(cat_gramm)
  136. #cat_gramm2.sort()
  137. #print(cat_gramm2)
  138. # %%
  139. final[2]
  140. # %%
  141. with open(baseDir + "power_lemmario.json", "w") as outfile:
  142. json.dump(final, outfile, indent=2)
  143. # %%
  144. with open(baseDir + "power_lemmarioB.json", "w") as outfile:
  145. json.dump(finalB, outfile, indent=2)
  146. # %%