123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283 |
- # %%
- # Imports
- from pickle import FALSE, TRUE
- import xml.etree.ElementTree as ET
- import re
- import json
- # importing os module
- import os
- # %%
# TEI namespace handling.
# The templates declare an xmlns attribute on their TEI element, so element
# look-ups need the "{uri}" prefix (uri1), and the URI is registered under the
# empty prefix so ElementTree can write it back as the default namespace.
uri1 = "{http://www.tei-c.org/ns/1.0}"
namespaces = {'': "http://www.tei-c.org/ns/1.0"}
for prefix, uri in namespaces.items():
    ET.register_namespace(prefix, uri)

# Reference directories: root of the data, input (xmlgat) and output (xmlevt).
basedir = '/Users/federicaspinelli/TEAMOVI/Parser/DATA/'
baseindir = basedir + 'OVI/datiniXML/xmlgat/'
baseoutdir = basedir + 'OVI/datiniXML/xmlevt/'
- # %%
# Load the lemma list and the bibliographic info file.
#
# `lems` is the parsed JSON lemma list; the functions in this file read the
# 'id', 'coordinate' and 'lemma' keys of its entries.
lemfile = '/Users/federicaspinelli/TEAMOVI/Parser/OVI/Lemmi/associazione lemmi - link TLIO/power_lemmarioD_link.json'
# Use a context manager so the handle is closed, and read as UTF-8 (the JSON
# default) instead of whatever the platform locale happens to be.
with open(lemfile, 'r', encoding='utf-8') as _lemFile:
    lems = json.load(_lemFile)

# Parse BiblioDatini.xml and keep the list of its <Biblio> nodes.
infofile = basedir + 'OVI/datiniXML/BiblioDatini.xml'
infotree = ET.parse(infofile)
inforoot = infotree.getroot()
infoBiblioNodeList = list(inforoot.iter('Biblio'))
- # %%
def lemIndex(lem):
    """Return the 'id' of the first lemma-list entry matching a <lem> node.

    An entry of the module-level ``lems`` list matches when the node's 'n'
    attribute is contained in the entry's 'coordinate' value.

    Raises:
        ValueError: if no entry matches.
    """
    code = lem.attrib['n']
    found = next((entry for entry in lems if code in entry['coordinate']), None)
    if found is None:
        raise ValueError("code " + code + " not found")
    return found['id']
def getBiblioNodeBySigla(sigla):
    """Return the first <Biblio> node having a <sigla> child whose text is *sigla*.

    The nodes searched are those collected in ``infoBiblioNodeList`` from
    BiblioDatini.xml. Returns None when no node matches.
    """
    matching = (
        biblio for biblio in infoBiblioNodeList
        if any(field.tag == 'sigla' and field.text == sigla for field in biblio)
    )
    return next(matching, None)
def getBiblioNodeByCodice(segnatura):
    """Return the first <Biblio> node having a <segnatura> child whose text is *segnatura*.

    The nodes searched are those in ``infoBiblioNodeList``. Returns None when
    no node matches.
    """
    def hasSegnatura(biblio):
        # True when at least one direct child is <segnatura> with the wanted text.
        return any(
            field.tag == 'segnatura' and field.text == segnatura
            for field in biblio
        )

    return next(filter(hasSegnatura, infoBiblioNodeList), None)
def getLetterRootFromFile(filecode, inputdirectory=None):
    """Parse the xmlgat file for an OVI sigla and return its root Element.

    Args:
        filecode: OVI sigla; the file read is ``xmlgat.<filecode>.xml``.
        inputdirectory: directory containing the file. Defaults to the
            module-level ``baseindir``, looked up at call time. A trailing
            path separator is optional.

    Returns:
        The root ``Element`` of the parsed document.

    Raises:
        xml.etree.ElementTree.ParseError: if the file is still not
            well-formed after the fallback clean-up below.
    """
    if inputdirectory is None:
        inputdirectory = baseindir
    # os.path.join works with or without a trailing separator on the directory.
    fileName = os.path.join(inputdirectory, 'xmlgat.' + filecode + '.xml')
    try:
        return ET.parse(fileName).getroot()
    except ET.ParseError:
        # Fallback: read the raw text as ISO-8859-1 and replace the
        # '&Csic&c' marker (not valid XML) with an italic "sic" element,
        # then parse the repaired string.
        with open(fileName, encoding="ISO-8859-1") as fp:
            xml_string = fp.read()
    xml_string = xml_string.replace('&Csic&c', "<hi rend='italic'>sic</hi>")
    return ET.fromstring(xml_string)
##################################
##################################
def _removeKeepingTail(parent, child):
    """Remove *child* from *parent*, re-attaching the child's tail text to whatever precedes it."""
    if child.tail:
        siblings = list(parent)
        position = siblings.index(child)
        if position == 0:
            parent.text = (parent.text or '') + child.tail
        else:
            previous = siblings[position - 1]
            previous.tail = (previous.tail or '') + child.tail
    parent.remove(child)


def surroundLems(letterRoot):
    """Make every lemma the content of its own <lem> element.

    In the xmlgat files the <lem> tag does not surround the lemma:

    1. A single-word lemma is in the tail of its <lem> tag, with no closing
       tag around the word:          <lem>A_LEM
    2. A multi-word lemma is inside a <w> element following the <lem>:
                                     <lem><w>A MULTIWORD LEM</w>

    The body of the text is taken to be the first <div> found. Its direct
    <lem> and <w> children are rewritten in place so that:

        <lem>A_LEM                   -->  <lem>A_LEM</lem>
        <lem><w>A MULTIWORD LEM</w>  -->  <lem>A MULTIWORD LEM</lem>

    In case 2 the <w> element becomes the <lem> (receiving a copy of the
    original <lem>'s attributes) and the original empty <lem> is removed.

    Details:
      * Only the first word of the tail is moved into the <lem>; the rest of
        the tail is left untouched (only that one occurrence is cut out).
      * A <lem> whose tail is missing, or contains no word character, waits
        for the next <w> among the <lem>/<w> children; when it is removed its
        tail (whitespace/punctuation) is kept in the document.
      * Only the first <w> after such a <lem> is converted; further <w>
        elements are left as they are.

    Args:
        letterRoot: root Element of an xmlgat letter.

    Returns:
        The (modified) <div> Element holding the text.

    Raises:
        IndexError: if *letterRoot* contains no <div>.
    """
    textRoot = next(letterRoot.iter('div'), None)
    if textRoot is None:
        raise IndexError("no <div> element found in the letter")

    # Snapshot of the relevant children: the loop below removes elements.
    candidates = [node for node in textRoot if node.tag in ('lem', 'w')]
    pendingLem = None  # empty <lem> still waiting for its <w>
    for node in candidates:
        if node.tag == 'w':
            if pendingLem is not None:
                node.tag = 'lem'
                # Copy, so the two elements never share one attribute dict.
                node.attrib = dict(pendingLem.attrib)
                _removeKeepingTail(textRoot, pendingLem)
            pendingLem = None
            continue
        firstWord = re.search(r'\w+', node.tail) if node.tail else None
        if firstWord is None:
            pendingLem = node
        else:
            node.text = firstWord.group()
            node.tail = node.tail[:firstWord.start()] + node.tail[firstWord.end():]
            pendingLem = None
    return textRoot
def getLemByCode(lem):
    """Return the lemma-list entry matching a <lem> node.

    The node (an ElementTree Element) is matched through its 'n' attribute:
    the first entry of the module-level ``lems`` list whose 'coordinate'
    value contains that attribute is returned.

    Raises:
        ValueError: if no entry matches.
    """
    code = lem.attrib['n']
    notFound = object()
    entry = next((item for item in lems if code in item['coordinate']), notFound)
    if entry is notFound:
        raise ValueError("code " + code + " not found")
    return entry
# Maps each OVI lemma type abbreviation to the label used in the final TEI output.
lemTypeDict = {
    's.m.': "sostantivo maschile",
    's.f.': 'sostantivo femminile',
    'antr.': 'antroponimo',
    'agg.': 'aggettivo',
    'n.g.': 'nome di luogo',
    'v.': 'verbo',
}
def redefineLems(textRoot, fileCode):
    """Turn every <lem> under *textRoot* into the element used in the TEI output.

    For each <lem>:
      * its 'n' attribute is prefixed with ``fileCode + '_'``;
      * the matching lemma-list entry is looked up with getLemByCode;
      * the element is renamed according to the entry's 'categoria':
        'antr.' -> <persName>, 'n.g.' -> <placeName>, anything else -> <w>;
      * a 'ref' attribute '#<entry id>' is added; <w> elements also get
        'pos' (the 'categoria') and 'type' (the 'iperlemma').

    The entry's 'forma_standard' and 'note' fields are not exported.
    The tree is modified in place; nothing is returned.

    Raises:
        ValueError: propagated from getLemByCode when a code is unknown.
    """
    for lemNode in textRoot.iter('lem'):
        lemNode.attrib['n'] = fileCode + '_' + lemNode.attrib['n']
        entry = getLemByCode(lemNode)
        reference = '#' + str(entry['id'])
        category = entry['lemma']['categoria']
        hyperlemma = entry['lemma']['iperlemma']

        isPerson = category == 'antr.'
        isPlace = category == 'n.g.'
        lemNode.tag = 'persName' if isPerson else ('placeName' if isPlace else 'w')
        lemNode.attrib['ref'] = reference
        if not (isPerson or isPlace):
            # Ordinary words additionally carry part of speech and hyperlemma.
            lemNode.attrib['pos'] = category
            lemNode.attrib['type'] = hyperlemma
def replacepbcode(textRoot, fileCode):
    """Number every <pb> under *textRoot*, in document order starting at 1.

    Each <pb> gets 'n' = '<fileCode> c. <k>' and 'xml:id' = '<fileCode>_<k>'.
    The tree is modified in place; nothing is returned.
    """
    for pageNumber, pageBreak in enumerate(textRoot.iter('pb'), start=1):
        suffix = str(pageNumber)
        pageBreak.set('n', fileCode + ' c. ' + suffix)
        pageBreak.set('xml:id', fileCode + '_' + suffix)
_DEFAULT_IMAGE_FOLDER = "/Users/federicaspinelli/TEAMOVI/evt-angular-ovi/src/assets/data/images/"


def replaceimg(textRoot, fileCode, folder=_DEFAULT_IMAGE_FOLDER):
    """Set the 'facs' attribute of every <pb> under *textRoot*.

    The k-th <pb> (counting from 1) is expected to have an image named
    ``(fileCode + '_' + k).upper() + '.jpg'`` in *folder*. If a file with
    exactly that name exists, 'facs' is set to it; otherwise 'facs' is set to
    'NO_IMAGE.jpg'. A "found ..."/"not found ..." line is printed per page.

    Args:
        textRoot: Element whose <pb> descendants are updated in place.
        fileCode: OVI sigla used to build the image names.
        folder: directory listing the available images.

    Raises:
        OSError: if *folder* cannot be listed.
    """
    # One directory listing, turned into a set for direct membership tests.
    availableImages = set(os.listdir(folder))
    for pageNumber, node in enumerate(textRoot.iter('pb'), start=1):
        imageName = (fileCode + '_' + str(pageNumber)).upper() + '.jpg'
        if imageName in availableImages:
            node.attrib['facs'] = imageName
            print("found " + imageName)
        else:
            node.attrib['facs'] = "NO_IMAGE.jpg"
            print("not found " + imageName)
def surroundPages(textRoot):
    """Regroup the children of *textRoot* into one <p> per page.

    A new <div> is built containing one <p> for every <pb> found anywhere
    under *textRoot*. The direct children of *textRoot* are then distributed:
    each direct-child <pb> starts the next <p> (and is its first element), and
    the following children are appended to that <p> until the next <pb>.

    * Anything before the first <pb> (including the text of *textRoot*
      itself) is not copied.
    * <milestone> elements are dropped, but the text that follows them
      (their tail) is kept by joining it to the preceding element's tail.

    The elements are shared with *textRoot*, not copied, so joining a
    milestone tail also changes the input tree.

    Args:
        textRoot: Element whose direct children are the page content.

    Returns:
        The new <div> Element.
    """
    newRoot = ET.Element('div')
    for _ in textRoot.iter('pb'):
        ET.SubElement(newRoot, 'p')

    pageIndex = -1  # no page opened until the first <pb>
    for child in textRoot:
        if child.tag == 'pb':
            pageIndex += 1
        if pageIndex < 0:
            continue
        page = newRoot[pageIndex]
        if child.tag == 'milestone':
            if child.tail:
                # A page always starts with its <pb>, so page[-1] exists.
                lastElement = page[-1]
                lastElement.tail = (lastElement.tail or '') + child.tail
            continue
        page.append(child)
    return newRoot
def getTemplateString():
    """Return the letter template as a string, with a <letterBody/> placeholder.

    The template file is parsed, an empty <letterBody> element is appended to
    the first TEI <div> found under the first TEI <body>, and the whole
    document is serialised to a unicode string.
    """
    templatePath = '/Users/federicaspinelli/TEAMOVI/Parser/OVI/EVT/pre_letter_template.xml'
    templateRoot = ET.parse(templatePath).getroot()
    firstBody = list(templateRoot.iter(uri1 + 'body'))[0]
    firstDiv = list(firstBody.iter(uri1 + 'div'))[0]
    # Placeholder later replaced by the processed letter text.
    ET.SubElement(firstDiv, 'letterBody')
    return ET.tostring(templateRoot, encoding='unicode', method='xml')
def newProcessFile(filecode, inputdirectory=baseindir):
    """Process the letter with OVI sigla *filecode* and return it as an XML string.

    Steps: read the xmlgat file, wrap the lemmas in <lem>, convert the lemmas
    to their TEI elements, number the page breaks, attach the page images,
    group the content in one <p> per page, then serialise.

    Args:
        filecode: OVI sigla of the letter.
        inputdirectory: directory of the xmlgat files.

    Returns:
        The serialised <div>, with the text after each '<lb />' line break
        shifted by the indent string.
    """
    letterRoot = getLetterRootFromFile(filecode, inputdirectory)
    textRoot = surroundLems(letterRoot)

    # In-place transformations of the text body.
    redefineLems(textRoot, filecode)
    replacepbcode(textRoot, filecode)
    replaceimg(textRoot, filecode)

    pagedRoot = surroundPages(textRoot)
    serialized = ET.tostring(pagedRoot, encoding='unicode', method='xml')
    # NOTE(review): indent string reproduced as found; confirm the intended width.
    lineIndent = " "
    return serialized.replace("<lb />\n", "<lb />\n" + lineIndent)
# %%
letterTemplateString = getTemplateString()
# %%
# Example: process one letter and write the filled-in template to test.xml.
filecodeExample = 'z99'
# Build the whole document first, so a processing error does not leave a
# truncated output file behind.
newString = letterTemplateString.replace(
    '<letterBody />', newProcessFile(filecodeExample))
# The serialised XML carries no encoding declaration, so write it as UTF-8
# (the XML default) rather than in the platform's locale encoding.
with open('/Users/federicaspinelli/TEAMOVI/Parser/DATA/test.xml', 'w', encoding='utf-8') as f1:
    f1.write(newString)
- # %%
- # %%