# %%
# Imports
import xml.etree.ElementTree as ET
import re
import json

# %%
# This is to handle the xmlns attribute in the TEI element in the templates
uri1 = "{http://www.tei-c.org/ns/1.0}"
namespaces = {
    '': "http://www.tei-c.org/ns/1.0",
}
for prefix, uri in namespaces.items():
    ET.register_namespace(prefix, uri)

# Reference directories
basedir = '/Users/federicaspinelli/TEAMOVI/Parser/DATA/'
baseindir = basedir + 'OVI/datiniXML/xmlgat/'
baseoutdir = basedir + 'OVI/datiniXML/xmlevt/'
# /Users/federicaspinelli/TEAMOVI/Parser/DATA/OVI/datiniXML

# %%
# Import lems list + xml info file

# Get the lem list as a json object.
# The file is opened in a context manager so the handle is closed as soon as
# the JSON has been loaded; JSON is read as UTF-8.
lemfile = '/Users/federicaspinelli/TEAMOVI/Parser/OVI/Lemmi/associazione lemmi - link TLIO/power_lemmarioD_link.json'
with open(lemfile, 'r', encoding='utf-8') as _lemfp:
    lems = json.load(_lemfp)

# Get BiblioDatini.xml, extract a list of the 'Biblio' nodes with ElementTree
infofile = basedir + 'OVI/datiniXML/BiblioDatini.xml'
infotree = ET.parse(infofile)
inforoot = infotree.getroot()
infoBiblioNodeList = list(inforoot.iter('Biblio'))


# %%
# FUNCTIONS TO PROCESS THE XMLGAT FILEs

def lemIndex(lem):
    """Return the 'id' of the lem-list entry matching an XML lem node.

    The match is made on the node's 'n' attribute: the first item of the
    module-level ``lems`` list whose 'coordinate' value contains that
    attribute wins.

    Raises:
        KeyError: if the node has no 'n' attribute.
        ValueError: if no item of ``lems`` matches.
    """
    code = lem.attrib['n']
    for item in lems:
        if code in item['coordinate']:
            return item['id']
    raise ValueError("code " + code + " not found")


def getBiblioNodeBySigla(sigla):
    """Return the 'Biblio' node of BiblioDatini.xml for an OVI sigla.

    A node matches when one of its direct children is a 'sigla' element
    whose text equals ``sigla``. Returns None when no node matches.
    """
    for node in infoBiblioNodeList:
        for child in node:
            if child.tag == 'sigla' and child.text == sigla:
                return node
    return None


def getLetterRootFromFile(filecode, inputdirectory=baseindir):
    """Return the root Element of the xmlgat file for an OVI sigla.

    The file name is built as ``inputdirectory + 'xmlgat.' + filecode +
    '.xml'``. The file is first parsed as-is; if that raises a ParseError it
    is re-read as ISO-8859-1 text, the '&Csic&c' sequence is replaced with
    'sic', and the result is parsed from the string.
    """
    fileName = inputdirectory + 'xmlgat.' + filecode + '.xml'
    try:
        return ET.parse(fileName).getroot()
    except ET.ParseError:
        # Fallback for files the strict parser rejects: patch the raw text
        # before parsing (a bare '&' is not well-formed XML).
        with open(fileName, encoding="ISO-8859-1") as fp:
            xml_string = fp.read()
        xml_string = xml_string.replace('&Csic&c', "sic")
        return ET.fromstring(xml_string)


##################################
# ELABORATING LEMS IN XMLGAT FILES
##################################

# PREMISE:
# in the xmlgat files, the <lem> tag doesn't surround lems:
#
# 1. Single-word lems are in the 'tail' of the corr. lem tags, as in:
#        <lem .../>A_LEM
#    with no closing </lem> after the word
#
# 2. Multiple-word lems are inside <w> tags immediately following the <lem>, as in:
#        <lem .../><w>A MULTIWORD LEM</w>
#
# NOTE(review): the literal tag examples in this comment were lost when the
# file was extracted; they are reconstructed from what surroundLems does
# ('lem' tail handling, 'w' elements following a 'lem') - confirm.
#
# The body of the text is inside a single <div>
# This function puts all lems inside a <lem> tag to make xmlgat files more
# standard xml-compliant and easier to process, also dropping the <w> tag.
# Basically:
#
#   <lem .../>A_LEM                    -->  <lem ...>A_LEM</lem>
#   <lem .../><w>A MULTIWORD LEM</w>   -->  <lem ...>A MULTIWORD LEM</lem>
#
# NOTE(review): the tag examples above were lost in extraction and are
# reconstructed from the code below - confirm.
def surroundLems(letterRoot):
    """Move every lem's text inside its 'lem' element and return the text root.

    Works in place on the first 'div' found under ``letterRoot`` and only
    looks at that div's direct 'lem' and 'w' children:

    * a 'lem' whose tail contains a word gets that first word as its text,
      and the word is removed from the tail;
    * a 'w' that follows a 'lem' without such a word takes over the lem's
      attributes and becomes the new 'lem'; the empty original is removed.

    Raises:
        IndexError: if ``letterRoot`` contains no 'div'.
    """
    textRoot = list(letterRoot.iter('div'))[0]
    texttags = [node for node in textRoot if node.tag in ('lem', 'w')]

    expect_w = False  # True when the previous tag did not yield a single-word lem
    prev_node = None
    for node in texttags:
        if expect_w and node.tag == 'w':
            node.tag = 'NEWlem'
            node.attrib = prev_node.attrib
            prev_node.tag = 'OLDlem'

        # A tail that is None or holds no word (e.g. only whitespace) means
        # the lem text is expected in a following <w>.
        first_word = None
        if node.tag == 'lem' and node.tail is not None:
            first_word = re.search(r'\w+', node.tail)

        if first_word is not None:
            node.text = first_word.group(0)
            # Cut out only this first word; a plain str.replace would also
            # delete every later occurrence of the same letters in the tail.
            node.tail = node.tail[:first_word.start()] + node.tail[first_word.end():]
            expect_w = False
        else:
            expect_w = True
        prev_node = node

    # Temporary tag names keep the two passes from interfering with each other.
    for node in textRoot.findall('OLDlem'):
        textRoot.remove(node)
    for node in textRoot.findall('NEWlem'):
        node.tag = 'lem'
    return textRoot


# This function tries to match a lem node (ElementTree Element object), by its
# attribute 'n', to a lem in the lem list, the json object 'lems'
def getLemByCode(lem):
    """Return the first item of ``lems`` whose 'coordinate' contains lem's 'n'.

    Raises:
        KeyError: if the node has no 'n' attribute.
        ValueError: if no item of ``lems`` matches.
    """
    code = lem.attrib['n']
    for item in lems:
        if code in item['coordinate']:
            return item
    raise ValueError("code " + code + " not found")


# Dictionary assigning to each OVI lem type a tag useful for the final TEI output
lemTypeDict = {'s.m.': "sostantivo maschile", 's.f.': 'sostantivo femminile',
               'antr.': 'antroponimo', 'agg.': 'aggettivo',
               'n.g.': 'nome di luogo', 'v.': 'verbo'}


# This function processes each lem's attributes and retags the lem for the
# final TEI output
def redefineLems(textRoot, fileCode):
    """Retag every 'lem' under ``textRoot`` according to the lem list (in place).

    Each lem's 'n' attribute is first prefixed with ``fileCode + '_'`` and
    then looked up with getLemByCode. Depending on the entry's
    ['lemma']['categoria']:

    * 'antr.' -> tag 'persName', attribute 'ref';
    * 'n.g.'  -> tag 'placeName', attribute 'ref';
    * other   -> tag 'w', attributes 'ref', 'pos' (the categoria) and
      'type' (the entry's ['lemma']['iperlemma']).

    'ref' is always '#' followed by the entry's 'id'.

    Raises:
        ValueError: propagated from getLemByCode when a lem is not found.
    """
    # NOTE(review): indentation was lost in extraction; 'pos' and 'type' are
    # assumed to belong to the generic 'w' branch only - confirm.
    for node in textRoot.iter('lem'):
        node.attrib['n'] = fileCode + '_' + node.attrib['n']
        thisLem = getLemByCode(node)
        lemRef = '#' + str(thisLem['id'])
        lemPos = thisLem['lemma']['categoria']
        lemType = thisLem['lemma']['iperlemma']

        node.attrib['ref'] = lemRef
        if lemPos == 'antr.':
            node.tag = 'persName'
        elif lemPos == 'n.g.':
            node.tag = 'placeName'
        else:
            node.tag = 'w'
            node.attrib['pos'] = lemPos
            node.attrib['type'] = lemType


def replacepbcode(textRoot, fileCode):
    """Number every 'pb' under ``textRoot`` and set its page attributes in place.

    The i-th 'pb' (counting from 1) gets 'n' = '<fileCode> c. <i>',
    'xml:id' = '<fileCode>_<i>' and 'facs' = '<fileCode>_<i>.jpg'.
    """
    for page_no, node in enumerate(textRoot.iter('pb'), start=1):
        page_id = fileCode + '_' + str(page_no)
        node.attrib['n'] = fileCode + ' c. ' + str(page_no)
        node.attrib['xml:id'] = page_id
        node.attrib['facs'] = page_id + '.jpg'


def surroundPages(textRoot):
    """Return a new root holding one 'p' per page of ``textRoot``.

    A 'p' is created for every 'pb' found anywhere under ``textRoot``. The
    direct children of ``textRoot`` are then moved, in order, into the 'p' of
    the page they belong to; a page starts at each direct-child 'pb', which
    is itself moved into its page. Children that come before the first 'pb'
    and all 'milestone' children are left out.
    """
    # Create a new, 'clean', root.
    # NOTE(review): the root markup inside this string literal was lost in
    # extraction; '<div></div>' is assumed because the letter body is read
    # from a 'div' - confirm against the original file.
    newRoot = ET.fromstring("<div></div>")

    # Add a <p> to the new root for each page in the old one
    for _ in textRoot.iter('pb'):
        ET.SubElement(newRoot, 'p')

    # Fill the pages in the new root
    page = None  # index of the current page; None until the first 'pb'
    for child in textRoot:
        if child.tag == 'pb':
            page = 0 if page is None else page + 1
        if page is not None and child.tag != 'milestone':
            # The appended element is the same object, so its tail text
            # travels with it.
            newRoot[page].append(child)
    return newRoot


# Get the letter template as a string
def getTemplateString():
    """Return the letter template serialized as a string.

    The template is parsed from pre_letter_template.xml; an empty
    'letterBody' element is appended to the first TEI 'div' inside the first
    TEI 'body' so that it can later be replaced by the letter text.

    Raises:
        IndexError: if the template has no TEI 'body' or no 'div' inside it.
    """
    preLetterTemplateTree = ET.parse('/Users/federicaspinelli/TEAMOVI/Parser/OVI/EVT/pre_letter_template.xml')
    templateRoot = preLetterTemplateTree.getroot()
    bodyNode = list(templateRoot.iter(uri1+'body'))[0]
    where = list(bodyNode.iter(uri1+'div'))[0]
    ET.SubElement(where, 'letterBody')
    return ET.tostring(templateRoot, encoding='unicode', method='xml')


# Get a file by OVI sigla 'filecode' as an ElementTree object, process its
# lems, transform it to string and format it
def newProcessFile(filecode, inputdirectory=baseindir):
    """Return the processed letter ``filecode`` as an XML string.

    Pipeline: load the xmlgat file, wrap the lems (surroundLems), retag them
    (redefineLems), number the page breaks (replacepbcode), group the content
    by page (surroundPages), serialize, and indent every line after the
    first by ``indent1``.
    """
    tree1 = getLetterRootFromFile(filecode, inputdirectory)
    textRoot1 = surroundLems(tree1)
    redefineLems(textRoot1, filecode)
    replacepbcode(textRoot1, filecode)
    textRoot2 = surroundPages(textRoot1)

    # NOTE(review): runs of spaces were collapsed in extraction, so the
    # original indent width may have been larger - confirm.
    indent1 = " "
    textString1 = ET.tostring(textRoot2, encoding='unicode', method='xml')
    return textString1.replace("\n", "\n"+indent1)


# %%
letterTemplateString = getTemplateString()

# %%
# Example
filecodeExample = '99b'
with open('/Users/federicaspinelli/TEAMOVI/Parser/DATA/test.xml', 'w', encoding='utf-8') as f1:
    # NOTE(review): the placeholder literal was lost in extraction (an empty
    # search string would insert the letter between every character).
    # '<letterBody />' is how ElementTree serializes the empty 'letterBody'
    # element added by getTemplateString - confirm.
    newString = letterTemplateString.replace('<letterBody />', newProcessFile(filecodeExample))
    f1.write(newString)

# %%

# %%