xmlgat_to_EVT_parser.py 5.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172
  1. # %%
  2. # Imports
  3. import xml.etree.ElementTree as ET
  4. import re
  5. import json
  6. # %%
  7. # This is to handle the xmnls attribute in the TEI element in the templates
  8. uri1 = "{http://www.tei-c.org/ns/1.0}"
  9. namespaces = {
  10. '': "http://www.tei-c.org/ns/1.0",
  11. }
  12. for prefix, uri in namespaces.items():
  13. ET.register_namespace(prefix, uri)
  14. # Reference directories
  15. basedir = '../../Data/'
  16. baseindir = basedir + 'DallOVI/datiniXML/xmlgat/'
  17. baseoutdir = basedir + 'DallOVI/datiniXML/xmlevt/'
  18. # %%
  19. # Import lems list + xml info file
  20. # get lem list as a json object
  21. lemfile = basedir + 'DallOVI/datiniXML/power_lemmarioB.json'
  22. lems = json.load(open(lemfile, 'r'))
  23. # Get BiblioDatini.xml, extract a list of the <Biblio> nodes with ElementTree
  24. infofile = basedir + 'DallOVI/datiniXML/BiblioDatini.xml'
  25. infotree = ET.parse(infofile)
  26. inforoot = infotree.getroot()
  27. infoBiblioNodeList = list(inforoot.iter('Biblio'))
  28. # %%
  29. # FUNCTIONS TO PROCESS THE XMLGAT FILEs
  30. # Get a lem index
  31. def lemIndex(lem):
  32. for item in lems:
  33. if lem.attrib['n'] in item['coordinate']:
  34. return item['id']
  35. raise ValueError("code " + lem.attrib['n'] + " not found")
  36. # Get the ElementTree node ('Element' class object) associated to an OVI sigla in the BiblioDatini.xml file
  37. def getBiblioNodeBySigla(sigla):
  38. for node in infoBiblioNodeList:
  39. for child in node:
  40. if child.tag=='sigla' and child.text==sigla:
  41. return node
  42. # Get the ElemenTree object of the whole xmlgat file corresponding to a given OVI sigla
  43. def getLetterTreeFromFile(filecode, inputdirectory=baseindir):
  44. fileName = inputdirectory + 'xmlgat.' + filecode + '.xml'
  45. letterTree = ET.parse(fileName)
  46. return letterTree
  47. ##################################
  48. # ELABORATING LEMS IN XMLGAT FILES
  49. ##################################
  50. # PREMISE:
  51. # in the xmlgat files, the <lem> tag doesn't surround lems:
  52. #
  53. # 1. Single-word lems are in the 'tail' of the corr. lem tags, as in:
  54. # <lem>A_LEM
  55. # with no closing </lem>
  56. #
  57. # 2. Multiple-word lems are inside <w> tags immediately following the <lem>, as in:
  58. # <lem><w>A MULTIWORD LEM</w>
  59. # The body of the text is inside a single <div>
  60. # This functions puts all lems inside a <lem> tag to make xmlgat files more standard xml-compliant and easier to process, also dropping the <w> tag.
  61. # Basically:
  62. #
  63. # <lem>A_LEM --> <lem>A_LEM</lem>
  64. # <lem><w>A MULTIWORD LEM</w> --> <lem>A MULTIWORD LEM</lem>
  65. def surroundLems(letterTree):
  66. textRoot = list(letterTree.getroot().iter('div'))[0]
  67. texttags = [node for node in textRoot if node.tag == 'lem' or node.tag == 'w']
  68. doit = False
  69. for node in texttags:
  70. if doit and node.tag=='w':
  71. node.tag = 'NEWlem'
  72. node.attrib = prev_node.attrib
  73. prev_node.tag = 'OLDlem'
  74. if node.tag == 'lem' and node.tail != None:
  75. thelem = re.findall(r'\w+', node.tail)[0] # First word
  76. node.text = thelem
  77. node.tail = node.tail.replace(thelem, '')
  78. doit = False
  79. else:
  80. doit = True
  81. prev_node = node
  82. for node in textRoot.findall('OLDlem'):
  83. textRoot.remove(node)
  84. for node in textRoot.findall('NEWlem'):
  85. node.tag = 'lem'
  86. return textRoot
  87. # This function tries to match a lem inside <lem> node (ElementTree Element object 'node'), by its attribute 'n',
  88. # the a lem in the lem list, the json object 'lems'
  89. def getLemByCode(lem):
  90. for item in lems:
  91. if lem.attrib['n'] in item['coordinate']:
  92. return item
  93. raise ValueError("code " + lem.attrib['n'] + " not found")
  94. # Dictionary assigning to each OVI lem type a tag useful for the final TEI output
  95. lemTypeDict = {'s.m.': "#sm", 's.f.': '#sf', 'antr.': '#antr', 'agg.': '#aggettivo', 'n.g.': '#ng', 'v.': '#verbo'}
  96. # This function processes each lem attributes and adds more tags around the lem useful for the final TEI output
  97. def redefineLems(textRoot, fileCode):
  98. for node in textRoot.iter('lem'):
  99. node.attrib['n'] = fileCode + '_' + node.attrib['n']
  100. thisLem = getLemByCode(node)
  101. lemRef = '#' + str(thisLem['id'])
  102. node.attrib.pop('n')
  103. node.attrib['ref'] = lemRef
  104. #
  105. lemType = thisLem['lemma']['categoria']
  106. sub = ET.SubElement(node, 'lem2')
  107. if lemType=='antr.':
  108. sub.attrib['ref'] = 'persName'
  109. elif lemType=='n.g.':
  110. sub.attrib['ref'] = 'placeName'
  111. elif lemType in lemTypeDict:
  112. sub.attrib['ref'] = lemTypeDict[lemType]
  113. sub.text = node.text
  114. node.text = ''
  115. for node in textRoot.iter('lem2'):
  116. node.tag = 'lem'
  117. # Get the letter template as a string
  118. def getTemplateString():
  119. preLetterTemplateTree = ET.parse('pre_letter_template.xml')
  120. where = list( list( list( preLetterTemplateTree.getroot().iter(uri1+'body') )[0].iter(uri1+'div') )[0].iter(uri1+'p') )[0]
  121. #
  122. ET.SubElement(where, 'letterBody')
  123. #
  124. letterString = ET.tostring(preLetterTemplateTree.getroot(), encoding='unicode', method='xml')
  125. return letterString
  126. # Get a file by OVI sigla 'filecode' as an ElementTree object, process its lem, transform it to string and format it
  127. def newProcessFile(filecode, inputdirectory=baseindir):
  128. tree1 = getLetterTreeFromFile(filecode, inputdirectory)
  129. #
  130. textRoot1 = surroundLems(tree1)
  131. #
  132. redefineLems(textRoot1, filecode)
  133. #
  134. indent1 = " "
  135. textString1 = ET.tostring(textRoot1, encoding='unicode', method='xml')
  136. textString2 = textString1.replace("<lb />\n", "<lb />\n"+indent1)
  137. return textString2
  138. # %%
  139. letterTemplateString = getTemplateString()
  140. # %%
  141. # Example
  142. filecodeExample = 'l82'
  143. with open('test.xml', 'w') as f1:
  144. newString = letterTemplateString.replace('<letterBody />', newProcessFile(filecodeExample))
  145. f1.write(newString)
  146. # %%