# %%
# Imports
import xml.etree.ElementTree as ET
import re
import json

# %%
# This is to handle the xmlns attribute in the TEI element in the templates
uri1 = "{http://www.tei-c.org/ns/1.0}"
namespaces = {
    '': "http://www.tei-c.org/ns/1.0",
}
for prefix, uri in namespaces.items():
    ET.register_namespace(prefix, uri)

# Reference directories
basedir = '/Users/federicaspinelli/TEAMOVI/Parser/DATA/'
baseindir = basedir + 'OVI/datiniXML/xmlgat/'
baseoutdir = basedir + 'OVI/datiniXML/xmlevt/'

# %%
# Import lems list + xml info file

# Get lem list as a json object.
# The file is opened in a `with` block so the handle is closed right after
# loading, and decoded explicitly as UTF-8 (the JSON default encoding).
lemfile = basedir + 'OVI/datiniXML/power_lemmarioB.json'
with open(lemfile, 'r', encoding='utf-8') as lemHandle:
    lems = json.load(lemHandle)

# Get BiblioDatini.xml, extract a list of the <Biblio> nodes with ElementTree
infofile = basedir + 'OVI/datiniXML/BiblioDatini.xml'
infotree = ET.parse(infofile)
inforoot = infotree.getroot()
infoBiblioNodeList = list(inforoot.iter('Biblio'))


# %%
# FUNCTIONS TO PROCESS THE XMLGAT FILEs

def lemIndex(lem):
    """Return the 'id' of the entry of `lems` matching the element `lem`.

    An entry matches when the value of the element's 'n' attribute is found
    in the entry's 'coordinate' field.

    Args:
        lem: an ElementTree Element carrying an 'n' attribute.

    Returns:
        The 'id' value of the first matching entry.

    Raises:
        KeyError: if `lem` has no 'n' attribute.
        ValueError: if no entry of `lems` matches.
    """
    code = lem.attrib['n']
    for item in lems:
        if code in item['coordinate']:
            return item['id']
    raise ValueError("code " + code + " not found")


def getBiblioNodeBySigla(sigla):
    """Return the <Biblio> node of BiblioDatini.xml associated to an OVI sigla.

    Args:
        sigla: the text expected inside one of the node's <sigla> children.

    Returns:
        The first node of `infoBiblioNodeList` having a direct <sigla> child
        whose text equals `sigla`, or None when there is no such node.
    """
    for node in infoBiblioNodeList:
        if any(child.tag == 'sigla' and child.text == sigla for child in node):
            return node
    return None


def getLetterTreeFromFile(filecode, inputdirectory=baseindir):
    """Parse the xmlgat file corresponding to a given OVI sigla.

    Args:
        filecode: OVI sigla; the file read is 'xmlgat.<filecode>.xml'.
        inputdirectory: directory prefix, concatenated as-is with the file
            name (so it must end with a path separator).

    Returns:
        The ElementTree object of the whole file.
    """
    fileName = inputdirectory + 'xmlgat.' + filecode + '.xml'
    return ET.parse(fileName)


##################################
# ELABORATING LEMS IN XMLGAT FILES
##################################

# PREMISE:
# in the xmlgat files, the <lem> tag doesn't surround lems:
#
# 1. Single-word lems are in the 'tail' of the corr. lem tags, as in:
#       <lem n="..."/>A_LEM
#    with no closing </lem> around the word
#
# 2.
#    Multiple-word lems are inside <w> tags immediately following the <lem>, as in:
#       <lem n="..."/><w>A MULTIWORD LEM</w>
# The body of the text is inside a single <div>
# The functions below put all lems inside a <lem> tag to make xmlgat files
# more standard xml-compliant and easier to process, then decorate each lem
# for the final TEI output.

def _firstDescendant(element, tag):
    """Return the first element tagged `tag` found by `element.iter(tag)`.

    Raises:
        IndexError: if there is no such element.
    """
    found = next(element.iter(tag), None)
    if found is None:
        raise IndexError("no <" + tag + "> element found")
    return found


def _removeKeepingTail(parent, child):
    """Remove `child` from `parent` without losing the text that follows it.

    ElementTree stores the text following an element in that element's
    'tail', so a plain remove() would drop it; here it is first appended to
    whatever precedes the child (previous sibling's tail or parent's text).
    """
    if child.tail:
        siblings = list(parent)
        position = siblings.index(child)
        if position == 0:
            parent.text = (parent.text or '') + child.tail
        else:
            before = siblings[position - 1]
            before.tail = (before.tail or '') + child.tail
    parent.remove(child)


def surroundLems(letterTree):
    """Wrap every lem of the letter body in its own <lem> element.

    Basically:

        <lem n="..."/>A_LEM                   -->  <lem n="...">A_LEM</lem>
        <lem n="..."/><w>A MULTIWORD LEM</w>  -->  <lem n="...">A MULTIWORD LEM</lem>

    Only direct <lem> and <w> children of the first <div> are considered.
    The tree is modified in place.

    Args:
        letterTree: ElementTree object of an xmlgat file.

    Returns:
        The first <div> element of the tree, after the transformation.

    Raises:
        IndexError: if the tree contains no <div> element.
    """
    textRoot = _firstDescendant(letterTree.getroot(), 'div')
    candidates = [child for child in textRoot if child.tag in ('lem', 'w')]

    # A <lem> with no word in its tail; the next <w> (if any) is its content.
    pendingLem = None
    for node in candidates:
        if node.tag == 'w':
            # Only a <w> that directly follows such a <lem> is a multiword
            # lem; any other <w> is left untouched.
            if pendingLem is not None:
                node.tag = 'NEWlem'
                node.attrib = dict(pendingLem.attrib)
                pendingLem.tag = 'OLDlem'
            pendingLem = None
            continue

        # node is a <lem>: look for the first word of its tail.
        firstWord = re.search(r'\w+', node.tail) if node.tail is not None else None
        if firstWord is None:
            # No tail, or a tail made only of whitespace/punctuation.
            pendingLem = node
            continue
        node.text = firstWord.group(0)
        # Cut out exactly the matched word, leaving later repetitions of the
        # same word in the tail alone.
        node.tail = node.tail[:firstWord.start()] + node.tail[firstWord.end():]
        pendingLem = None

    # Temporary tag names keep the two passes below from interfering with
    # the genuine <lem> elements.
    for node in textRoot.findall('OLDlem'):
        _removeKeepingTail(textRoot, node)
    for node in textRoot.findall('NEWlem'):
        node.tag = 'lem'
    return textRoot


def getLemByCode(lem):
    """Return the entry of the lem list `lems` matching the element `lem`.

    An entry matches when the value of the element's 'n' attribute is found
    in the entry's 'coordinate' field.

    Args:
        lem: an ElementTree Element carrying an 'n' attribute.

    Returns:
        The first matching entry of `lems`.

    Raises:
        KeyError: if `lem` has no 'n' attribute.
        ValueError: if no entry of `lems` matches.
    """
    code = lem.attrib['n']
    for item in lems:
        if code in item['coordinate']:
            return item
    raise ValueError("code " + code + " not found")


# Dictionary assigning to each OVI lem type a tag useful for the final TEI output
lemTypeDict = {
    's.m.': "#sm",
    's.f.': '#sf',
    'antr.': '#antr',
    'agg.': '#aggettivo',
    'n.g.': '#ng',
    'v.': '#verbo',
}


def redefineLems(textRoot, fileCode):
    """Rewrite each <lem> attributes and nest its text in an inner <lem>.

    For every <lem> under `textRoot`:
      * its 'n' attribute is prefixed with `fileCode` + '_' and looked up in
        the lem list, then replaced by a 'ref' attribute '#<id>';
      * its text is moved into a new child <lem> whose 'ref' attribute
        depends on the lem category (no 'ref' for unknown categories).

    The tree is modified in place; nothing is returned.

    Args:
        textRoot: element containing the <lem> elements to process.
        fileCode: OVI sigla of the file being processed.

    Raises:
        ValueError: if a lem code is not found in the lem list.
    """
    # Snapshot first: children are added to the tree inside the loop, and the
    # snapshot also guarantees the new inner <lem>s are not visited.
    for node in list(textRoot.iter('lem')):
        node.attrib['n'] = fileCode + '_' + node.attrib['n']
        thisLem = getLemByCode(node)
        node.attrib.pop('n')
        node.attrib['ref'] = '#' + str(thisLem['id'])

        lemType = thisLem['lemma']['categoria']
        inner = ET.SubElement(node, 'lem')
        # 'antr.' and 'n.g.' are special-cased, so their lemTypeDict entries
        # are never used here.
        if lemType == 'antr.':
            inner.attrib['ref'] = 'persName'
        elif lemType == 'n.g.':
            inner.attrib['ref'] = 'placeName'
        elif lemType in lemTypeDict:
            inner.attrib['ref'] = lemTypeDict[lemType]
        inner.text = node.text
        node.text = ''


def getTemplateString():
    """Return the letter template as a string, with a body placeholder.

    An empty <letterBody> element is appended to the first <p> of the first
    <div> of the first TEI <body> of the template; being empty, it is
    serialized as '<letterBody />'.

    Raises:
        IndexError: if the template lacks the body/div/p chain.
    """
    preLetterTemplateTree = ET.parse('/Users/federicaspinelli/TEAMOVI/Parser/OVI/EVT/pre_letter_template.xml')
    templateRoot = preLetterTemplateTree.getroot()
    where = templateRoot
    for localName in ('body', 'div', 'p'):
        where = _firstDescendant(where, uri1 + localName)

    ET.SubElement(where, 'letterBody')

    return ET.tostring(templateRoot, encoding='unicode', method='xml')


def newProcessFile(filecode, inputdirectory=baseindir):
    """Process the lems of an xmlgat file and return its body as a string.

    Args:
        filecode: OVI sigla of the file to process.
        inputdirectory: directory prefix of the xmlgat files.

    Returns:
        The serialized <div> of the letter, with `indent1` inserted after
        every newline.
    """
    tree1 = getLetterTreeFromFile(filecode, inputdirectory)

    textRoot1 = surroundLems(tree1)

    redefineLems(textRoot1, filecode)

    indent1 = " "
    textString1 = ET.tostring(textRoot1, encoding='unicode', method='xml')
    return textString1.replace("\n", "\n" + indent1)


# %%
letterTemplateString = getTemplateString()

# %%
# Example
filecodeExample = 'l82'
# The placeholder is the serialized form of the empty <letterBody> element
# added by getTemplateString(). Build the string before opening the file so
# that a processing error does not leave an empty test.xml behind.
newString = letterTemplateString.replace('<letterBody />', newProcessFile(filecodeExample))
with open('test.xml', 'w', encoding='utf-8') as f1:
    f1.write(newString)

# %%