# xmlgat_to_EVT_parser.py
  1. # %%
  2. # Imports
  3. import xml.etree.ElementTree as ET
  4. import re
  5. import json
  6. # %%
  7. # This is to handle the xmnls attribute in the TEI element in the templates
  8. uri1 = "{http://www.tei-c.org/ns/1.0}"
  9. namespaces = {
  10. '': "http://www.tei-c.org/ns/1.0",
  11. }
  12. for prefix, uri in namespaces.items():
  13. ET.register_namespace(prefix, uri)
  14. # Reference directories
  15. basedir = '/Users/federicaspinelli/TEAMOVI/Parser/DATA/'
  16. baseindir = basedir + 'OVI/datiniXML/xmlgat/'
  17. baseoutdir = basedir + 'OVI/datiniXML/xmlevt/'
  18. # /Users/federicaspinelli/TEAMOVI/Parser/DATA/OVI/datiniXML
  19. # %%
  20. # Import lems list + xml info file
  21. # get lem list as a json object
  22. lemfile = '/Users/federicaspinelli/TEAMOVI/Parser/OVI/Lemmi/associazione lemmi - link TLIO/power_lemmarioD_link.json'
  23. lems = json.load(open(lemfile, 'r'))
  24. # Get BiblioDatini.xml, extract a list of the <Biblio> nodes with ElementTree
  25. infofile = basedir + 'OVI/datiniXML/BiblioDatini.xml'
  26. infotree = ET.parse(infofile)
  27. inforoot = infotree.getroot()
  28. infoBiblioNodeList = list(inforoot.iter('Biblio'))
  29. # %%
  30. # FUNCTIONS TO PROCESS THE XMLGAT FILEs
  31. # Get a lem index
  32. def lemIndex(lem):
  33. for item in lems:
  34. if lem.attrib['n'] in item['coordinate']:
  35. return item['id']
  36. raise ValueError("code " + lem.attrib['n'] + " not found")
  37. # Get the ElementTree node ('Element' class object) associated to an OVI sigla in the BiblioDatini.xml file
  38. def getBiblioNodeBySigla(sigla):
  39. for node in infoBiblioNodeList:
  40. for child in node:
  41. if child.tag=='sigla' and child.text==sigla:
  42. return node
  43. # Get the ElemenTree object of the whole xmlgat file corresponding to a given OVI sigla
  44. def getLetterRootFromFile(filecode, inputdirectory=baseindir):
  45. fileName = inputdirectory + 'xmlgat.' + filecode + '.xml'
  46. try:
  47. letterRoot = ET.parse(fileName).getroot()
  48. except ET.ParseError:
  49. with open(fileName, encoding="ISO-8859-1") as fp:
  50. xml_string = fp.read()
  51. xml_string = xml_string.replace('&Csic&c', "<hi rend='italic'>sic</hi>")
  52. # return xml_string
  53. letterRoot = ET.fromstring(xml_string)
  54. return letterRoot
##################################
# ELABORATING LEMS IN XMLGAT FILES
##################################
# PREMISE:
# in the xmlgat files, the <lem> tag doesn't surround lems:
#
# 1. Single-word lems are in the 'tail' of the corresponding <lem> tags, as in:
#      <lem>A_LEM
#    with no closing </lem>
#
# 2. Multiple-word lems are inside <w> tags immediately following the <lem>, as in:
#      <lem><w>A MULTIWORD LEM</w>
#
# The body of the text is inside a single <div>.
# This function puts all lems inside a <lem> tag, to make xmlgat files more
# standard xml-compliant and easier to process, also dropping the <w> tag.
# Basically:
#
#   <lem>A_LEM                   -->  <lem>A_LEM</lem>
#   <lem><w>A MULTIWORD LEM</w>  -->  <lem>A MULTIWORD LEM</lem>
  73. def surroundLems(letterRoot):
  74. textRoot = list(letterRoot.iter('div'))[0]
  75. texttags = [node for node in textRoot if node.tag == 'lem' or node.tag == 'w']
  76. doit = False
  77. for node in texttags:
  78. if doit and node.tag=='w':
  79. node.tag = 'NEWlem'
  80. node.attrib = prev_node.attrib
  81. prev_node.tag = 'OLDlem'
  82. if node.tag == 'lem' and node.tail != None:
  83. thelem = re.findall(r'\w+', node.tail)[0] # First word
  84. node.text = thelem
  85. node.tail = node.tail.replace(thelem, '')
  86. doit = False
  87. else:
  88. doit = True
  89. prev_node = node
  90. for node in textRoot.findall('OLDlem'):
  91. textRoot.remove(node)
  92. for node in textRoot.findall('NEWlem'):
  93. node.tag = 'lem'
  94. return textRoot
  95. # This function tries to match a lem inside <lem> node (ElementTree Element object 'node'), by its attribute 'n',
  96. # the a lem in the lem list, the json object 'lems'
  97. def getLemByCode(lem):
  98. for item in lems:
  99. if lem.attrib['n'] in item['coordinate']:
  100. return item
  101. raise ValueError("code " + lem.attrib['n'] + " not found")
  102. # Dictionary assigning to each OVI lem type a tag useful for the final TEI output
  103. lemTypeDict = {'s.m.': "sostantivo maschile", 's.f.': 'sostantivo femminile', 'antr.': 'antroponimo', 'agg.': 'aggettivo', 'n.g.': 'nome di luogo', 'v.': 'verbo'}
  104. # This function processes each lem attributes and adds more tags around the lem useful for the final TEI output
  105. def redefineLems(textRoot, fileCode):
  106. for node in textRoot.iter('lem'):
  107. node.attrib['n'] = fileCode + '_' + node.attrib['n']
  108. thisLem = getLemByCode(node)
  109. lemRef = '#' + str(thisLem['id'])
  110. # node.attrib.pop('n')
  111. lemPos = thisLem['lemma']['categoria']
  112. lemType = thisLem['lemma']['iperlemma']
  113. # lemStandard = thisLem['lemma']['forma_standard']
  114. # lemNote = thisLem['lemma']['note']
  115. #node.attrib['type'] = lemType
  116. # if (lemStandard != ''):
  117. # node.attrib['sameAs'] = lemStandard
  118. # sub = ET.SubElement(node, 'rdg')
  119. # sub.text = lemStandard
  120. # sub.attrib['type'] = 'forma standard'
  121. if lemPos=='antr.':
  122. node.tag = 'persName'
  123. node.attrib['ref'] = lemRef
  124. elif lemPos=='n.g.':
  125. node.tag = 'placeName'
  126. node.attrib['ref'] = lemRef
  127. else:
  128. node.tag = 'w'
  129. node.attrib['ref'] = lemRef
  130. node.attrib['pos'] = lemPos
  131. node.attrib['type'] = lemType
  132. # if (lemNote != ''):
  133. # sub = ET.SubElement(node, 'note')
  134. # sub.text = lemNote
  135. #sub.text = node.text
  136. node.text = node.text
  137. #node.tag = 'note'
  138. #for node in textRoot.iter('lem'):
  139. # node.tag = 'lem'
  140. # node.attrib['ref'] = lemRef
  141. # node.attrib['type'] = lemTypeDict[lemType]
  142. def replacepbcode(textRoot, fileCode):
  143. for ii, node in enumerate(textRoot.iter('pb')):
  144. node.attrib['n'] = fileCode + ' c. ' + str(ii + 1)
  145. node.attrib['xml:id'] = fileCode + '_' + str(ii + 1)
  146. node.attrib['facs'] = fileCode + '_' + str(ii + 1)+'.jpg'
  147. def surroundPages(textRoot):
  148. # Create a new, 'clean', root
  149. newRoot = ET.fromstring("<div/>")
  150. # Add a <p/> to the new root for each page in the old one
  151. for node in textRoot.iter('pb'):
  152. ET.SubElement(newRoot, 'p')
  153. # Fill the pages in the new root
  154. page = None
  155. elementInPage = None
  156. for child in textRoot:
  157. if child.tag=='pb' and page is None:
  158. page = 0
  159. elementInPage = 0
  160. elif child.tag=='pb':
  161. page = page+1
  162. elementInPage = 0
  163. if page is not None and elementInPage is not None and child.tag!='milestone':
  164. newRoot[page].append(child)
  165. newRoot[page][elementInPage].tail = child.tail
  166. elementInPage = elementInPage+1
  167. return newRoot
  168. # Get the letter template as a string
  169. def getTemplateString():
  170. preLetterTemplateTree = ET.parse('/Users/federicaspinelli/TEAMOVI/Parser/OVI/EVT/pre_letter_template.xml')
  171. where = list( list( preLetterTemplateTree.getroot().iter(uri1+'body') )[0].iter(uri1+'div') )[0]
  172. #
  173. ET.SubElement(where, 'letterBody')
  174. #
  175. letterString = ET.tostring(preLetterTemplateTree.getroot(), encoding='unicode', method='xml')
  176. return letterString
  177. # Get a file by OVI sigla 'filecode' as an ElementTree object, process its lem, transform it to string and format it
  178. def newProcessFile(filecode, inputdirectory=baseindir):
  179. tree1 = getLetterRootFromFile(filecode, inputdirectory)
  180. #ET.dump(tree1)
  181. #
  182. textRoot1 = surroundLems(tree1)
  183. #
  184. redefineLems(textRoot1, filecode)
  185. #
  186. replacepbcode(textRoot1, filecode)
  187. #
  188. textRoot2 = surroundPages(textRoot1)
  189. #
  190. indent1 = " "
  191. textString1 = ET.tostring(textRoot2, encoding='unicode', method='xml')
  192. textString2 = textString1.replace("<lb />\n", "<lb />\n"+indent1)
  193. return textString2
  194. # %%
  195. letterTemplateString = getTemplateString()
  196. # %%
  197. # Example
  198. filecodeExample = '99b'
  199. with open('/Users/federicaspinelli/TEAMOVI/Parser/DATA/test.xml', 'w') as f1:
  200. newString = letterTemplateString.replace('<letterBody />', newProcessFile(filecodeExample))
  201. f1.write(newString)
  202. # %%
  203. # %%