# xmlgat_to_EVT_parser.py

  1. # %%
  2. # Imports
  3. from pickle import FALSE, TRUE
  4. import xml.etree.ElementTree as ET
  5. import re
  6. import json
  7. # importing os module
  8. import os
  9. # %%
  10. # This is to handle the xmnls attribute in the TEI element in the templates
  11. uri1 = "{http://www.tei-c.org/ns/1.0}"
  12. namespaces = {
  13. '': "http://www.tei-c.org/ns/1.0",
  14. }
  15. for prefix, uri in namespaces.items():
  16. ET.register_namespace(prefix, uri)
  17. # Reference directories
  18. basedir = '/Users/federicaspinelli/TEAMOVI/Parser/DATA/'
  19. baseindir = basedir + 'OVI/datiniXML/xmlgat/'
  20. baseoutdir = basedir + 'OVI/datiniXML/xmlevt/'
  21. # /Users/federicaspinelli/TEAMOVI/Parser/DATA/OVI/datiniXML
  22. # %%
  23. # Import lems list + xml info file
  24. # get lem list as a json object
  25. lemfile = '/Users/federicaspinelli/TEAMOVI/Parser/OVI/Lemmi/associazione lemmi - link TLIO/power_lemmarioD_link.json'
  26. lems = json.load(open(lemfile, 'r'))
  27. # Get BiblioDatini.xml, extract a list of the <Biblio> nodes with ElementTree
  28. infofile = basedir + 'OVI/datiniXML/BiblioDatini.xml'
  29. infotree = ET.parse(infofile)
  30. inforoot = infotree.getroot()
  31. infoBiblioNodeList = list(inforoot.iter('Biblio'))
  32. # %%
  33. # FUNCTIONS TO PROCESS THE XMLGAT FILEs
  34. # Get a lem index
  35. def lemIndex(lem):
  36. for item in lems:
  37. if lem.attrib['n'] in item['coordinate']:
  38. return item['id']
  39. raise ValueError("code " + lem.attrib['n'] + " not found")
  40. # Get the ElementTree node ('Element' class object) associated to an OVI sigla in the BiblioDatini.xml file
  41. def getBiblioNodeBySigla(sigla):
  42. for node in infoBiblioNodeList:
  43. for child in node:
  44. if child.tag == 'sigla' and child.text == sigla:
  45. return node
  46. def getBiblioNodeByCodice(segnatura):
  47. for node in infoBiblioNodeList:
  48. for child in node:
  49. if child.tag == 'segnatura' and child.text == segnatura:
  50. return node
  51. # Get the ElemenTree object of the whole xmlgat file corresponding to a given OVI sigla
  52. def getLetterRootFromFile(filecode, inputdirectory=baseindir):
  53. fileName = inputdirectory + 'xmlgat.' + filecode + '.xml'
  54. try:
  55. letterRoot = ET.parse(fileName).getroot()
  56. except ET.ParseError:
  57. with open(fileName, encoding="ISO-8859-1") as fp:
  58. xml_string = fp.read()
  59. xml_string = xml_string.replace(
  60. '&Csic&c', "<hi rend='italic'>sic</hi>")
  61. # return xml_string
  62. letterRoot = ET.fromstring(xml_string)
  63. return letterRoot
##################################
# ELABORATING LEMS IN XMLGAT FILES
##################################
# PREMISE:
# in the xmlgat files, the <lem> tag doesn't surround lems:
#
# 1. Single-word lems are in the 'tail' of the corresponding <lem> tag, as in:
#        <lem>A_LEM
#    with no closing </lem>
#
# 2. Multiple-word lems are inside a <w> tag immediately following the <lem>, as in:
#        <lem><w>A MULTIWORD LEM</w>
#
# The body of the text is inside a single <div>.
# This function puts all lems inside a <lem> tag to make xmlgat files more
# standard XML-compliant and easier to process, also dropping the <w> tag.
# Basically:
#
#     <lem>A_LEM                   -->  <lem>A_LEM</lem>
#     <lem><w>A MULTIWORD LEM</w>  -->  <lem>A MULTIWORD LEM</lem>
  82. def surroundLems(letterRoot):
  83. textRoot = list(letterRoot.iter('div'))[0]
  84. texttags = [node for node in textRoot if node.tag ==
  85. 'lem' or node.tag == 'w']
  86. doit = False
  87. for node in texttags:
  88. if doit and node.tag == 'w':
  89. node.tag = 'NEWlem'
  90. node.attrib = prev_node.attrib
  91. prev_node.tag = 'OLDlem'
  92. if node.tag == 'lem' and node.tail != None:
  93. thelem = re.findall(r'\w+', node.tail)[0] # First word
  94. node.text = thelem
  95. node.tail = node.tail.replace(thelem, '')
  96. doit = False
  97. else:
  98. doit = True
  99. prev_node = node
  100. for node in textRoot.findall('OLDlem'):
  101. textRoot.remove(node)
  102. for node in textRoot.findall('NEWlem'):
  103. node.tag = 'lem'
  104. return textRoot
  105. # This function tries to match a lem inside <lem> node (ElementTree Element object 'node'), by its attribute 'n',
  106. # the a lem in the lem list, the json object 'lems'
  107. def getLemByCode(lem):
  108. for item in lems:
  109. if lem.attrib['n'] in item['coordinate']:
  110. return item
  111. raise ValueError("code " + lem.attrib['n'] + " not found")
  112. # Dictionary assigning to each OVI lem type a tag useful for the final TEI output
  113. lemTypeDict = {'s.m.': "sostantivo maschile", 's.f.': 'sostantivo femminile',
  114. 'antr.': 'antroponimo', 'agg.': 'aggettivo', 'n.g.': 'nome di luogo', 'v.': 'verbo'}
  115. # This function processes each lem attributes and adds more tags around the lem useful for the final TEI output
  116. def redefineLems(textRoot, fileCode):
  117. for node in textRoot.iter('lem'):
  118. node.attrib['n'] = fileCode + '_' + node.attrib['n']
  119. thisLem = getLemByCode(node)
  120. lemRef = '#' + str(thisLem['id'])
  121. # node.attrib.pop('n')
  122. lemPos = thisLem['lemma']['categoria']
  123. lemType = thisLem['lemma']['iperlemma']
  124. # lemStandard = thisLem['lemma']['forma_standard']
  125. # lemNote = thisLem['lemma']['note']
  126. #node.attrib['type'] = lemType
  127. # if (lemStandard != ''):
  128. # node.attrib['sameAs'] = lemStandard
  129. # sub = ET.SubElement(node, 'rdg')
  130. # sub.text = lemStandard
  131. # sub.attrib['type'] = 'forma standard'
  132. if lemPos == 'antr.':
  133. node.tag = 'persName'
  134. node.attrib['ref'] = lemRef
  135. elif lemPos == 'n.g.':
  136. node.tag = 'placeName'
  137. node.attrib['ref'] = lemRef
  138. else:
  139. node.tag = 'w'
  140. node.attrib['ref'] = lemRef
  141. node.attrib['pos'] = lemPos
  142. node.attrib['type'] = lemType
  143. # if (lemNote != ''):
  144. # sub = ET.SubElement(node, 'note')
  145. # sub.text = lemNote
  146. #sub.text = node.text
  147. node.text = node.text
  148. #node.tag = 'note'
  149. # for node in textRoot.iter('lem'):
  150. # node.tag = 'lem'
  151. # node.attrib['ref'] = lemRef
  152. # node.attrib['type'] = lemTypeDict[lemType]
  153. def replacepbcode(textRoot, fileCode):
  154. for ii, node in enumerate(textRoot.iter('pb')):
  155. node.attrib['n'] = fileCode + ' c. ' + str(ii + 1)
  156. node.attrib['xml:id'] = fileCode + '_' + str(ii + 1)
  157. # filecodeupper = (fileCode + '_' + str(ii + 1)).upper()
  158. # imgjpg = filecodeupper+'.jpg'
  159. # def replaceimg(textRoot, fileCode):
  160. # folder = "/Volumes/GoogleDrive-117836417327186331381/Il mio Drive/OVI-CNR/images-all/"
  161. # listfolder = os.listdir(folder)
  162. # files_dir = [f for f in listfolder if os.path.isdir(os.path.join(folder, f))]
  163. # for ii, node in enumerate(textRoot.iter('pb')):
  164. # pbfile = fileCode
  165. # print("PB "+pbfile)
  166. # filecodeupper = (fileCode + '_0' + str(ii + 1)).upper()+'.jpg'
  167. # done = FALSE
  168. # for imgfile in files_dir:
  169. # if imgfile == pbfile:
  170. # node.attrib['facs'] = filecodeupper
  171. # done = TRUE
  172. # print("found "+filecodeupper)
  173. # if imgfile != pbfile:
  174. # if imgfile != filecodeupper:
  175. # node.attrib['facs'] = "NO_IMAGE.jpg"
  176. # node.attrib['facs'] = 'NO_IMAGE' + '_' + str(ii + 1)+'.jpg'
  177. # print("not found "+filecodeupper)
  178. def replaceimg(textRoot, fileCode):
  179. folder = "/Volumes/GoogleDrive-117836417327186331381/Il mio Drive/OVI-CNR/images-all/"
  180. files = os.listdir(folder)
  181. for ii, node in enumerate(textRoot.iter('pb')):
  182. filecodeupper = (fileCode + '_0' + str(ii + 1)).upper()+'.jpg'
  183. foldercode = fileCode.upper()
  184. done = FALSE
  185. for imgfile in files:
  186. if imgfile == foldercode:
  187. node.attrib['facs'] = foldercode+"/"+filecodeupper
  188. done = TRUE
  189. print("found "+ filecodeupper)
  190. # print("found "+filecodeupper)
  191. if done != TRUE:
  192. node.attrib['facs'] = "NO_IMAGE/NO_IMAGE.jpg"
  193. print("not found "+filecodeupper)
  194. # if imgfile != filecodeupper:
  195. # node.attrib['facs'] = 'NO_IMAGE' + '_' + str(ii + 1)+'.jpg'
  196. # print("not found "+filecodeupper)
  197. def surroundPages(textRoot):
  198. # Create a new, 'clean', root
  199. newRoot = ET.fromstring("<div/>")
  200. # Add a <p/> to the new root for each page in the old one
  201. for node in textRoot.iter('pb'):
  202. ET.SubElement(newRoot, 'p')
  203. # Fill the pages in the new root
  204. page = None
  205. elementInPage = None
  206. for child in textRoot:
  207. if child.tag == 'pb' and page is None:
  208. page = 0
  209. elementInPage = 0
  210. elif child.tag == 'pb':
  211. page = page+1
  212. elementInPage = 0
  213. if page is not None and elementInPage is not None and child.tag != 'milestone':
  214. newRoot[page].append(child)
  215. newRoot[page][elementInPage].tail = child.tail
  216. elementInPage = elementInPage+1
  217. return newRoot
  218. # Get the letter template as a string
  219. def getTemplateString():
  220. preLetterTemplateTree = ET.parse(
  221. '/Users/federicaspinelli/TEAMOVI/Parser/OVI/EVT/pre_letter_template.xml')
  222. where = list(list(preLetterTemplateTree.getroot().iter(
  223. uri1+'body'))[0].iter(uri1+'div'))[0]
  224. #
  225. ET.SubElement(where, 'letterBody')
  226. #
  227. letterString = ET.tostring(
  228. preLetterTemplateTree.getroot(), encoding='unicode', method='xml')
  229. return letterString
  230. # Get a file by OVI sigla 'filecode' as an ElementTree object, process its lem, transform it to string and format it
  231. def newProcessFile(filecode, inputdirectory=baseindir):
  232. tree1 = getLetterRootFromFile(filecode, inputdirectory)
  233. # ET.dump(tree1)
  234. #
  235. textRoot1 = surroundLems(tree1)
  236. #
  237. redefineLems(textRoot1, filecode)
  238. #
  239. replacepbcode(textRoot1, filecode)
  240. replaceimg(textRoot1, filecode)
  241. #
  242. textRoot2 = surroundPages(textRoot1)
  243. #
  244. indent1 = " "
  245. textString1 = ET.tostring(textRoot2, encoding='unicode', method='xml')
  246. textString2 = textString1.replace("<lb />\n", "<lb />\n"+indent1)
  247. return textString2
  248. # %%
  249. letterTemplateString = getTemplateString()
  250. # %%
  251. # Example
  252. filecodeExample = 'z99'
  253. with open('/Users/federicaspinelli/TEAMOVI/Parser/OVI/EVT/test.xml', 'w') as f1:
  254. newString = letterTemplateString.replace(
  255. '<letterBody />', newProcessFile(filecodeExample))
  256. f1.write(newString)
  257. # %%
  258. # %%