# %%
# Imports
# NOTE: pickle.FALSE / pickle.TRUE are pickle protocol opcodes (bytes objects),
# not booleans. The import is kept only because code further down this file
# refers to these names; new code should use the builtin True / False.
from pickle import FALSE, TRUE
import xml.etree.ElementTree as ET
import re
import json
# importing os module
import os

# %%
# This is to handle the xmlns attribute in the TEI element in the templates
uri1 = "{http://www.tei-c.org/ns/1.0}"
namespaces = {
    '': "http://www.tei-c.org/ns/1.0",
}
for prefix, uri in namespaces.items():
    ET.register_namespace(prefix, uri)

# Reference directories
basedir = '/Users/federicaspinelli/TEAMOVI/Parser/DATA/'
baseindir = basedir + 'OVI/datiniXML/xmlgat/'
baseoutdir = basedir + 'OVI/datiniXML/xmlevt/'
# /Users/federicaspinelli/TEAMOVI/Parser/DATA/OVI/datiniXML

# %%
# Import lems list + xml info file

# Get the lem list as a json object (the file is closed as soon as it is read)
lemfile = '/Users/federicaspinelli/TEAMOVI/Parser/OVI/Lemmi/associazione lemmi - link TLIO/power_lemmarioD_link.json'
with open(lemfile, 'r') as lemfp:
    lems = json.load(lemfp)

# Get BiblioDatini.xml, extract a list of the 'Biblio' nodes with ElementTree
infofile = basedir + 'OVI/datiniXML/BiblioDatini.xml'
infotree = ET.parse(infofile)
inforoot = infotree.getroot()
infoBiblioNodeList = list(inforoot.iter('Biblio'))


# %%
# FUNCTIONS TO PROCESS THE XMLGAT FILEs

def lemIndex(lem):
    """Return the 'id' of the entry of `lems` matching a lem element.

    An entry matches when the element's 'n' attribute is contained in the
    entry's 'coordinate' value (Python `in` test).

    Raises:
        ValueError: if no entry of `lems` matches.
    """
    code = lem.attrib['n']
    for item in lems:
        if code in item['coordinate']:
            return item['id']
    raise ValueError("code " + code + " not found")


def _findBiblioNode(childTag, childText):
    """Return the first 'Biblio' node having a direct child with the given
    tag and text, or None when there is no such node."""
    for node in infoBiblioNodeList:
        if any(child.text == childText for child in node.findall(childTag)):
            return node
    return None


def getBiblioNodeBySigla(sigla):
    """Get the ElementTree node ('Element' class object) associated to an OVI
    sigla in the BiblioDatini.xml file (None if the sigla is not present)."""
    return _findBiblioNode('sigla', sigla)


def getBiblioNodeByCodice(segnatura):
    """Get the 'Biblio' node whose 'segnatura' child has the given text
    (None if there is no such node)."""
    return _findBiblioNode('segnatura', segnatura)


def getLetterRootFromFile(filecode, inputdirectory=baseindir):
    """Get the ElementTree root of the whole xmlgat file corresponding to a
    given OVI sigla.

    The file read is `inputdirectory + 'xmlgat.' + filecode + '.xml'`.
    If the file cannot be parsed as it is, it is re-read as ISO-8859-1 text,
    the '&Csic&c' sequence is replaced, and the result is parsed from the
    string.

    Raises:
        ET.ParseError: if the file is still not well-formed after the
            replacement.
    """
    fileName = inputdirectory + 'xmlgat.' + filecode + '.xml'
    try:
        return ET.parse(fileName).getroot()
    except ET.ParseError:
        # Fallback for files containing the '&Csic&c' pseudo-entity, which
        # is not valid XML.
        with open(fileName, encoding="ISO-8859-1") as fp:
            xml_string = fp.read()
        xml_string = xml_string.replace('&Csic&c', "sic")
        return ET.fromstring(xml_string)


##################################
# ELABORATING LEMS IN XMLGAT FILES
##################################
# PREMISE:
# (the markup in this comment was reconstructed from the code that consumes
# it -- verify against a real xmlgat file)
#
# in the xmlgat files, the <lem> tag doesn't surround lems:
#
# 1. Single-word lems are in the 'tail' of the corr. lem tags, as in:
#        <lem n="..."/>A_LEM
#    with no closing </lem> after the word
#
# 2. Multiple-word lems are inside <w> tags immediately following the <lem>,
#    as in:
#        <lem n="..."/><w>A MULTIWORD LEM</w>
#
# The body of the text is inside a single <div>
# This function puts all lems inside a <lem> tag to make xmlgat files more
# standard xml-compliant and easier to process, also dropping the <w> tag.
# Basically (markup reconstructed from the code below):
#
#   <lem n="..."/>A_LEM                   -->  <lem n="...">A_LEM</lem>
#   <lem n="..."/><w>A MULTIWORD LEM</w>  -->  <lem n="...">A MULTIWORD LEM</lem>
def surroundLems(letterRoot):
    """Move every lem inside its own 'lem' element, in place.

    Only the direct 'lem' / 'w' children of the first 'div' found under
    `letterRoot` are processed.

    Returns:
        The first 'div' element of `letterRoot` (modified in place).

    Raises:
        IndexError: if `letterRoot` contains no 'div' element.
    """
    textRoot = list(letterRoot.iter('div'))[0]
    texttags = [node for node in textRoot if node.tag in ('lem', 'w')]

    doit = False        # True when the next 'w' must be turned into a lem
    prev_node = None
    for node in texttags:
        if doit and node.tag == 'w':
            # Multi-word lem: the 'w' becomes the lem and takes over the
            # attributes of the node before it, which is dropped below.
            node.tag = 'NEWlem'
            node.attrib = dict(prev_node.attrib)
            prev_node.tag = 'OLDlem'

        firstWord = None
        if node.tag == 'lem' and node.tail is not None:
            firstWord = re.search(r'\w+', node.tail)

        if firstWord is not None:
            # Single-word lem: move the first word of the tail inside the tag.
            # Only that first occurrence is removed from the tail, so the
            # rest of the text is left untouched.
            node.text = firstWord.group(0)
            node.tail = node.tail[:firstWord.start()] + node.tail[firstWord.end():]
            doit = False
        else:
            # No word directly after this node (missing tail, or a tail with
            # no word characters): a following 'w' holds the lem.
            doit = True
        prev_node = node

    # NOTE(review): removing an element also discards its tail text.
    for node in textRoot.findall('OLDlem'):
        textRoot.remove(node)
    for node in textRoot.findall('NEWlem'):
        node.tag = 'lem'
    return textRoot


# This function tries to match a lem node (ElementTree Element object), by
# its attribute 'n', to a lem in the lem list, the json object 'lems'
def getLemByCode(lem):
    """Return the entry of `lems` whose 'coordinate' contains the 'n'
    attribute of `lem`.

    Raises:
        ValueError: if no entry matches.
    """
    code = lem.attrib['n']
    for item in lems:
        if code in item['coordinate']:
            return item
    raise ValueError("code " + code + " not found")


# Dictionary assigning to each OVI lem type a tag useful for the final TEI output
lemTypeDict = {
    's.m.': "sostantivo maschile",
    's.f.': 'sostantivo femminile',
    'antr.': 'antroponimo',
    'agg.': 'aggettivo',
    'n.g.': 'nome di luogo',
    'v.': 'verbo',
}


# This function processes each lem's attributes and retags the lem for the
# final TEI output
def redefineLems(textRoot, fileCode):
    """Retag every 'lem' element under `textRoot`, in place.

    The 'n' attribute is prefixed with `fileCode + '_'`, the matching entry
    of the lem list is looked up, and the element becomes:

    * 'persName'  (with 'ref')                 when 'categoria' is 'antr.'
    * 'placeName' (with 'ref')                 when 'categoria' is 'n.g.'
    * 'w'         (with 'ref', 'pos', 'type')  otherwise

    Raises:
        ValueError: if a lem has no match in the lem list.
    """
    # Snapshot the matches first: tags are rewritten while looping.
    for node in list(textRoot.iter('lem')):
        node.attrib['n'] = fileCode + '_' + node.attrib['n']
        thisLem = getLemByCode(node)
        lemPos = thisLem['lemma']['categoria']
        lemType = thisLem['lemma']['iperlemma']

        node.attrib['ref'] = '#' + str(thisLem['id'])
        if lemPos == 'antr.':
            node.tag = 'persName'
        elif lemPos == 'n.g.':
            node.tag = 'placeName'
        else:
            node.tag = 'w'
            node.attrib['pos'] = lemPos
            # NOTE(review): the original indentation was lost; 'type' is
            # assumed to belong to this branch only -- confirm.
            node.attrib['type'] = lemType


def replacepbcode(textRoot, fileCode):
    """Number the 'pb' elements under `textRoot` in document order, setting
    'n' to '<fileCode> c. <k>' and 'xml:id' to '<fileCode>_<k>' (k from 1)."""
    for pageNumber, node in enumerate(textRoot.iter('pb'), start=1):
        node.attrib['n'] = fileCode + ' c. ' + str(pageNumber)
        node.attrib['xml:id'] = fileCode + '_' + str(pageNumber)


def replaceimg(textRoot, fileCode,
               folder="/Volumes/GoogleDrive-117836417327186331381/Il mio Drive/OVI-CNR/images-all/"):
    """Set the 'facs' attribute of every 'pb' element under `textRoot`.

    If `folder` has an entry named like the upper-cased `fileCode`, page k
    gets '<FILECODE>/<FILECODE>_0<k>.jpg'; otherwise every page gets
    'NO_IMAGE/NO_IMAGE.jpg'. One "found" / "not found" line is printed per
    page.

    Args:
        textRoot: element whose 'pb' descendants are updated in place.
        fileCode: OVI sigla of the letter.
        folder: directory listing the per-letter image folders.

    Raises:
        OSError: if `folder` cannot be listed.
    """
    foldercode = fileCode.upper()
    # The directory listing does not depend on the page: check it only once.
    hasImages = foldercode in set(os.listdir(folder))

    for pageNumber, node in enumerate(textRoot.iter('pb'), start=1):
        # NOTE(review): '_0' is a fixed prefix, so page 10 yields '_010' --
        # confirm this matches the image file names.
        filecodeupper = (fileCode + '_0' + str(pageNumber)).upper() + '.jpg'
        if hasImages:
            node.attrib['facs'] = foldercode + "/" + filecodeupper
            print("found " + filecodeupper)
        else:
            node.attrib['facs'] = "NO_IMAGE/NO_IMAGE.jpg"
            print("not found " + filecodeupper)


def surroundPages(textRoot):
    """Regroup the direct children of `textRoot` into one 'p' per page.

    A new root is created with one empty 'p' per 'pb' element found under
    `textRoot`. Each direct child is then appended to the 'p' of the page
    opened by the most recent direct-child 'pb' (the 'pb' itself included).
    Children before the first 'pb' and 'milestone' children are left out.

    Returns:
        The new root element.
    """
    # Create a new, 'clean', root.
    # NOTE(review): the root markup was stripped from the source this was
    # recovered from; an empty <div> is assumed -- confirm.
    newRoot = ET.Element('div')

    # Add a <p> to the new root for each page in the old one
    for _ in textRoot.iter('pb'):
        ET.SubElement(newRoot, 'p')

    # Fill the pages in the new root
    page = -1  # no page opened yet
    for child in textRoot:
        if child.tag == 'pb':
            page += 1
        # NOTE(review): a skipped 'milestone' takes its tail text with it.
        if page >= 0 and child.tag != 'milestone':
            newRoot[page].append(child)
    return newRoot


# Get the letter template as a string
def getTemplateString():
    """Return the letter template serialized as a string, with an empty
    'letterBody' placeholder element appended to the first TEI 'div' of the
    first TEI 'body'.

    Raises:
        IndexError: if the template has no TEI 'body' or no 'div' inside it.
    """
    preLetterTemplateTree = ET.parse(
        '/Users/federicaspinelli/TEAMOVI/Parser/OVI/EVT/pre_letter_template.xml')
    templateRoot = preLetterTemplateTree.getroot()
    body = list(templateRoot.iter(uri1 + 'body'))[0]
    where = list(body.iter(uri1 + 'div'))[0]
    # Placeholder, later replaced by the processed letter text.
    ET.SubElement(where, 'letterBody')
    return ET.tostring(templateRoot, encoding='unicode', method='xml')


# Get a file by OVI sigla 'filecode' as an ElementTree object, process its
# lems, transform it to string and format it
def newProcessFile(filecode, inputdirectory=baseindir):
    """Return the processed text of the letter `filecode` as an XML string,
    with `indent1` inserted after every newline."""
    tree1 = getLetterRootFromFile(filecode, inputdirectory)
    # ET.dump(tree1)
    # NOTE(review): line structure was lost in the source this was recovered
    # from; every step below is assumed to be live code -- confirm.
    textRoot1 = surroundLems(tree1)
    redefineLems(textRoot1, filecode)
    replacepbcode(textRoot1, filecode)
    replaceimg(textRoot1, filecode)
    textRoot2 = surroundPages(textRoot1)
    # NOTE(review): the original width of this indent may have been lost.
    indent1 = " "
    textString1 = ET.tostring(textRoot2, encoding='unicode', method='xml')
    return textString1.replace("\n", "\n" + indent1)


# %%
letterTemplateString = getTemplateString()

# %%
# Example
filecodeExample = 'z99'
# Build the whole document before opening the output file, so that a failure
# while processing does not leave a truncated file behind.
# NOTE(review): the placeholder literal was stripped from the source this was
# recovered from; '<letterBody />' is how ElementTree serializes the empty
# element added by getTemplateString -- confirm.
newString = letterTemplateString.replace(
    '<letterBody />', newProcessFile(filecodeExample))
with open('/Users/federicaspinelli/TEAMOVI/Parser/OVI/EVT/test.xml', 'w',
          encoding='utf-8') as f1:
    f1.write(newString)

# %%

# %%