123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172 |
- # %%
- # Imports
- import xml.etree.ElementTree as ET
- import re
- import json
- # %%
# Register the TEI namespace under the empty prefix; this is how the script
# handles the xmlns attribute of the TEI element in the templates.
namespaces = {'': "http://www.tei-c.org/ns/1.0"}
# Same namespace in the "{uri}" form that is prepended to tag names when
# searching the namespaced template tree.
uri1 = "{http://www.tei-c.org/ns/1.0}"
for prefix, uri in namespaces.items():
    ET.register_namespace(prefix, uri)
# Reference directories: the data root, the folder the xmlgat files are read
# from and (presumably, it is not used in this part of the file) the folder
# the converted files are meant to be written to.
basedir = '../../Data/'
baseindir = f'{basedir}DallOVI/datiniXML/xmlgat/'
baseoutdir = f'{basedir}DallOVI/datiniXML/xmlevt/'
- # %%
# Import lems list + xml info file
# Get the lem list as a JSON object. The file is read inside a ``with`` block
# so the handle is closed as soon as parsing is done, and it is decoded
# explicitly as UTF-8 (the encoding JSON is specified to use) instead of the
# platform default.
lemfile = basedir + 'DallOVI/datiniXML/power_lemmarioB.json'
with open(lemfile, 'r', encoding='utf-8') as _lemhandle:
    lems = json.load(_lemhandle)
# Get BiblioDatini.xml, extract a list of the <Biblio> nodes with ElementTree
infofile = basedir + 'DallOVI/datiniXML/BiblioDatini.xml'
infotree = ET.parse(infofile)
inforoot = infotree.getroot()
infoBiblioNodeList = list(inforoot.iter('Biblio'))
- # %%
- # FUNCTIONS TO PROCESS THE XMLGAT FILEs
def lemIndex(lem):
    """Return the 'id' of the lem-list entry matching a <lem> node.

    The node's 'n' attribute is looked up (with ``in``) in the 'coordinate'
    field of each entry of the module-level ``lems`` list; the first entry
    that contains it wins.

    Raises:
        ValueError: if no entry contains the code.
    """
    code = lem.attrib['n']
    notFound = object()
    lemId = next(
        (entry['id'] for entry in lems if code in entry['coordinate']),
        notFound,
    )
    if lemId is notFound:
        raise ValueError("code " + code + " not found")
    return lemId
def getBiblioNodeBySigla(sigla):
    """Return the <Biblio> node of BiblioDatini.xml associated to an OVI sigla.

    A node matches when one of its direct children is a <sigla> element whose
    text equals ``sigla``. The nodes are taken from the module-level
    ``infoBiblioNodeList``; the first match is returned, or None when no node
    matches.
    """
    for biblioNode in infoBiblioNodeList:
        if any(child.tag == 'sigla' and child.text == sigla for child in biblioNode):
            return biblioNode
    return None
def getLetterTreeFromFile(filecode, inputdirectory=baseindir):
    """Parse the xmlgat file of a given OVI sigla and return its ElementTree.

    The file read is ``<inputdirectory>xmlgat.<filecode>.xml``; the directory
    string is used as-is, so it must already end with a path separator.
    """
    return ET.parse(''.join((inputdirectory, 'xmlgat.', filecode, '.xml')))
##################################
# ELABORATING LEMS IN XMLGAT FILES
##################################
# PREMISE:
# in the xmlgat files, the <lem> tag doesn't surround lems:
#
# 1. Single-word lems are in the 'tail' of the corresponding lem tags, as in:
#        <lem>A_LEM
#    with no closing </lem>
#
# 2. Multiple-word lems are inside <w> tags immediately following the <lem>, as in:
#        <lem><w>A MULTIWORD LEM</w>
#
# The body of the text is inside a single <div>.
def _removeKeepingTail(parent, child):
    """Remove ``child`` from ``parent`` without dropping the text after it.

    ElementTree stores the text that follows an element in its ``tail``, so a
    plain ``remove`` would discard it; here it is appended to the previous
    sibling's tail (or to the parent's text for a first child).
    """
    if child.tail:
        siblings = list(parent)
        position = siblings.index(child)
        if position == 0:
            parent.text = (parent.text or '') + child.tail
        else:
            previous = siblings[position - 1]
            previous.tail = (previous.tail or '') + child.tail
    parent.remove(child)


def surroundLems(letterTree):
    """Put every lem inside its <lem> tag and drop the <w> wrappers.

    This function makes xmlgat files more standard xml-compliant and easier
    to process. Basically:

        <lem>A_LEM                   -->  <lem>A_LEM</lem>
        <lem><w>A MULTIWORD LEM</w>  -->  <lem>A MULTIWORD LEM</lem>

    Only the <lem> and <w> elements that are direct children of the first
    <div> of the tree are processed; the tree is modified in place.

    Args:
        letterTree: ElementTree of an xmlgat file.

    Returns:
        The first <div> element of the tree, after the transformation.

    Raises:
        IndexError: if the tree contains no <div> element.
    """
    textRoot = next(letterTree.getroot().iter('div'), None)
    if textRoot is None:
        raise IndexError("no <div> element found in the letter tree")
    texttags = [node for node in textRoot if node.tag in ('lem', 'w')]
    firstWord = re.compile(r'\w+')

    # 'doit' is True when the previous candidate did not receive a word from
    # its own tail, i.e. when a following <w> has to take its place.
    doit = False
    prev_node = None
    for node in texttags:
        if doit and node.tag == 'w':
            # The <w> becomes the lem and inherits the attributes of the
            # element before it, which is marked for removal. Temporary tag
            # names keep the two kinds apart until the loop is over.
            node.tag = 'NEWlem'
            node.attrib = prev_node.attrib
            prev_node.tag = 'OLDlem'
        match = None
        if node.tag == 'lem' and node.tail is not None:
            # A tail with no word in it (e.g. just a newline before a <w>)
            # yields no match and is handled like a missing tail.
            match = firstWord.search(node.tail)
        if match is not None:
            node.text = match.group()
            # Cut out only the matched word: str.replace without a count
            # would also delete every later occurrence of the same letters.
            node.tail = node.tail[:match.start()] + node.tail[match.end():]
            doit = False
        else:
            doit = True
        prev_node = node

    for node in textRoot.findall('OLDlem'):
        _removeKeepingTail(textRoot, node)
    for node in textRoot.findall('NEWlem'):
        node.tag = 'lem'
    return textRoot
def getLemByCode(lem):
    """Match a <lem> node to an entry of the lem list by its 'n' attribute.

    Args:
        lem: ElementTree Element whose 'n' attribute holds the lem code.

    Returns:
        The first entry of the module-level ``lems`` JSON object whose
        'coordinate' field contains the code (tested with ``in``).

    Raises:
        ValueError: if no entry contains the code.
    """
    code = lem.attrib['n']
    for entry in lems:
        if code not in entry['coordinate']:
            continue
        return entry
    raise ValueError("code " + code + " not found")
# Maps each OVI lem type (the 'categoria' of a lem) to the tag used for it in
# the final TEI output.
lemTypeDict = {
    's.m.': "#sm",
    's.f.': '#sf',
    'antr.': '#antr',
    'agg.': '#aggettivo',
    'n.g.': '#ng',
    'v.': '#verbo',
}
def redefineLems(textRoot, fileCode):
    """Rewrite the attributes of every <lem> and wrap its text in an inner tag.

    For each <lem> found under ``textRoot`` (modified in place):

    * the 'n' attribute is prefixed with ``fileCode`` and an underscore, the
      lem is looked up with ``getLemByCode``, and 'n' is replaced by a 'ref'
      attribute made of '#' followed by the entry's 'id';
    * the text of the lem is moved into a new child element whose 'ref' is
      'persName' for category 'antr.', 'placeName' for 'n.g.', the value in
      ``lemTypeDict`` for the other known categories, and absent otherwise.

    The children are created as <lem2> and renamed to <lem> only at the end,
    so that the running ``iter('lem')`` does not visit them.

    Raises:
        ValueError: propagated from ``getLemByCode`` for an unknown code.
    """
    for lemNode in textRoot.iter('lem'):
        attributes = lemNode.attrib
        # getLemByCode reads the 'n' attribute, so it has to carry the file
        # prefix during the lookup and can only be dropped afterwards.
        attributes['n'] = fileCode + '_' + attributes['n']
        entry = getLemByCode(lemNode)
        reference = '#' + str(entry['id'])
        del attributes['n']
        attributes['ref'] = reference

        category = entry['lemma']['categoria']
        inner = ET.SubElement(lemNode, 'lem2')
        if category == 'antr.':
            inner.set('ref', 'persName')
        elif category == 'n.g.':
            inner.set('ref', 'placeName')
        elif category in lemTypeDict:
            inner.set('ref', lemTypeDict[category])
        inner.text, lemNode.text = lemNode.text, ''

    for inner in textRoot.iter('lem2'):
        inner.tag = 'lem'
def getTemplateString():
    """Return the letter template as a string.

    Parses 'pre_letter_template.xml' from the current directory, appends an
    empty <letterBody> element to the first <p> of the first <div> of the
    first <body> (all in the TEI namespace), and serialises the whole
    document to a unicode string.
    """
    templateRoot = ET.parse('pre_letter_template.xml').getroot()
    firstBody = list(templateRoot.iter(uri1 + 'body'))[0]
    firstDiv = list(firstBody.iter(uri1 + 'div'))[0]
    firstParagraph = list(firstDiv.iter(uri1 + 'p'))[0]
    # Placeholder element, added without a namespace.
    ET.SubElement(firstParagraph, 'letterBody')
    return ET.tostring(templateRoot, encoding='unicode', method='xml')
def newProcessFile(filecode, inputdirectory=baseindir):
    """Load the xmlgat file of an OVI sigla, process its lems, return a string.

    The file is parsed with ``getLetterTreeFromFile``, its lems are
    normalised with ``surroundLems`` and enriched with ``redefineLems``; the
    resulting <div> is serialised to a unicode string in which every
    "<lb />" followed by a newline gets an indent appended after the newline.
    """
    letterRoot = surroundLems(getLetterTreeFromFile(filecode, inputdirectory))
    redefineLems(letterRoot, filecode)
    serialized = ET.tostring(letterRoot, encoding='unicode', method='xml')
    lineBreak = "<lb />\n"
    return serialized.replace(lineBreak, lineBreak + " ")
- # %%
letterTemplateString = getTemplateString()
# %%
# Example: convert one letter and save the result as test.xml
filecodeExample = 'l82'
# Build the whole document before opening the output file, so that an error
# while processing the letter does not leave an empty, truncated test.xml.
newString = letterTemplateString.replace('<letterBody />', newProcessFile(filecodeExample))
# The string comes from ET.tostring(..., encoding='unicode') and therefore has
# no XML declaration: readers will assume UTF-8, so write it as UTF-8 rather
# than in the platform's default encoding.
with open('test.xml', 'w', encoding='utf-8') as f1:
    f1.write(newString)
- # %%
|