# TEAMOVI / Parser
# %%
# Imports
from pickle import FALSE, TRUE
import xml.etree.ElementTree as ET
import re
import json
# importing os module
import os
# %%
# Handle the xmlns attribute of the TEI element in the templates.
# uri1 is the TEI namespace in the '{uri}' form used as a prefix of tag names;
# getTemplateString() concatenates it with 'body' and 'div' to find those nodes.
uri1 = "{http://www.tei-c.org/ns/1.0}"
# Maps a serialization prefix to its namespace URI. The empty prefix is
# presumably meant to make TEI the default (unprefixed) namespace on output.
namespaces = {
    '':         "http://www.tei-c.org/ns/1.0",
}
# Register every prefix/URI pair with ElementTree (module-wide setting).
for prefix, uri in namespaces.items():
    ET.register_namespace(prefix, uri)
# Reference directories (absolute paths on the author's machine).
# basedir: root of the data tree; every path below is built from it.
basedir = '/Users/federicaspinelli/TEAMOVI/Parser/DATA/'
# baseindir: default input directory of getLetterRootFromFile() and
# newProcessFile(), holding the 'xmlgat.<code>.xml' source files.
baseindir = basedir + 'OVI/datiniXML/xmlgat/'
# baseoutdir: not referenced by the code visible in this file.
# NOTE(review): presumably the destination of the converted files - confirm.
baseoutdir = basedir + 'OVI/datiniXML/xmlevt/'
# /Users/federicaspinelli/TEAMOVI/Parser/DATA/OVI/datiniXML
# %%
# Import lems list + xml info file

# Get the lem list as a JSON object. The file is opened in a 'with' block so
# the handle is closed as soon as the data is loaded, and it is decoded as
# UTF-8 explicitly instead of relying on the platform default encoding.
lemfile = '/Users/federicaspinelli/TEAMOVI/Parser/OVI/Lemmi/associazione lemmi - link TLIO/power_lemmarioD_link.json'
with open(lemfile, 'r', encoding='utf-8') as lemHandle:
    lems = json.load(lemHandle)

# Get BiblioDatini.xml, extract a list of the <Biblio> nodes with ElementTree.
# The list is scanned by getBiblioNodeBySigla() and getBiblioNodeByCodice().
infofile = basedir + 'OVI/datiniXML/BiblioDatini.xml'
infotree = ET.parse(infofile)
inforoot = infotree.getroot()
infoBiblioNodeList = list(inforoot.iter('Biblio'))
# %%
# FUNCTIONS TO PROCESS THE XMLGAT FILEs

# Get a lem index


def lemIndex(lem):
    """Return the 'id' of the first entry of ``lems`` matching *lem*.

    An entry matches when the 'n' attribute of *lem* is contained in the
    entry's 'coordinate' value.
    NOTE(review): if 'coordinate' is a string, ``in`` is a substring test;
    if it is a list, it is an exact-item test - confirm against the JSON.

    Raises:
        ValueError: no entry of ``lems`` matches.
        KeyError: *lem* has no 'n' attribute.
    """
    code = lem.attrib['n']
    hit = next((entry for entry in lems if code in entry['coordinate']), None)
    if hit is None:
        raise ValueError("code " + code + " not found")
    return hit['id']

# Get the ElementTree node ('Element' class object) associated to an OVI sigla in the BiblioDatini.xml file


def getBiblioNodeBySigla(sigla):
    """Return the first <Biblio> node having a <sigla> child whose text is *sigla*.

    The nodes are taken from the module-level ``infoBiblioNodeList``.
    Returns None when no node matches.
    """
    for biblio in infoBiblioNodeList:
        if any(el.tag == 'sigla' and el.text == sigla for el in biblio):
            return biblio
    return None


def getBiblioNodeByCodice(segnatura):
    """Return the first <Biblio> node having a <segnatura> child whose text is *segnatura*.

    The nodes are taken from the module-level ``infoBiblioNodeList``.
    Returns None when no node matches.
    """
    for biblio in infoBiblioNodeList:
        if any(el.tag == 'segnatura' and el.text == segnatura for el in biblio):
            return biblio
    return None

# Get the ElementTree root element of the whole xmlgat file corresponding to a given OVI sigla


def getLetterRootFromFile(filecode, inputdirectory=baseindir):
    """Parse ``<inputdirectory>xmlgat.<filecode>.xml`` and return its root element.

    The file is first parsed directly. If that raises ET.ParseError, it is
    re-read as ISO-8859-1 text, every '&Csic&c' is replaced by an italic
    <hi> element containing 'sic', and the patched text is parsed instead.
    A parse error of the patched text is not caught.
    """
    path = ''.join((inputdirectory, 'xmlgat.', filecode, '.xml'))
    try:
        return ET.parse(path).getroot()
    except ET.ParseError:
        # Fallback: patch the marker that is not well-formed XML, then retry.
        with open(path, encoding="ISO-8859-1") as source:
            rawText = source.read()
        patchedText = rawText.replace(
            '&Csic&c', "<hi rend='italic'>sic</hi>")
        return ET.fromstring(patchedText)


##################################
# ELABORATING LEMS IN XMLGAT FILES
##################################

# PREMISE:
# in the xmlgat files, the <lem> tag doesn't surround lems:
#
# 1. Single-word lems are in the 'tail' of the corr. lem tags, as in:
#  <lem>A_LEM
# with no closing </lem>
#
# 2. Multiple-word lems are inside <w> tags immediately following the <lem>, as in:
#  <lem><w>A MULTIWORD LEM</w>
# The body of the text is inside a single <div>

# This function puts all lems inside a <lem> tag to make xmlgat files more standard xml-compliant and easier to process, also dropping the <w> tag.
# Basically:
#
# <lem>A_LEM  -->  <lem>A_LEM</lem>
# <lem><w>A MULTIWORD LEM</w>  -->  <lem>A MULTIWORD LEM</lem>
def surroundLems(letterRoot):
    """Move every lem of the letter inside its own <lem> element, in place.

    Works on the first <div> found under (or at) *letterRoot* and only looks
    at its direct <lem> and <w> children:

    * a <lem> whose tail contains a word gets that first word as its text,
      and exactly that one occurrence is cut out of the tail;
    * a <w> that follows a <lem> without such a word replaces it: the <w>
      is renamed to <lem>, takes over the attributes of the empty <lem>,
      and the empty <lem> is removed.

    Returns:
        The <div> element that was processed.

    Raises:
        IndexError: *letterRoot* contains no <div>.
    """
    textRoot = list(letterRoot.iter('div'))[0]

    texttags = [node for node in textRoot if node.tag in ('lem', 'w')]

    # True while the previous node left its lem text still to be supplied,
    # i.e. a following <w> is the multi-word lem belonging to it.
    doit = False
    for node in texttags:
        if doit and node.tag == 'w':
            # Temporary tag names keep the two kinds of nodes apart until the
            # loop is over; they are resolved after it.
            node.tag = 'NEWlem'
            node.attrib = prev_node.attrib
            prev_node.tag = 'OLDlem'

        firstWord = None
        if node.tag == 'lem' and node.tail is not None:
            firstWord = re.search(r'\w+', node.tail)

        if firstWord is not None:
            node.text = firstWord.group()
            # Cut out only the matched occurrence: str.replace() without a
            # count would also delete every later repetition of the same
            # string (even inside other words) from the running text.
            node.tail = (node.tail[:firstWord.start()]
                         + node.tail[firstWord.end():])
            doit = False
        else:
            # No tail, or a tail without any word character (e.g. only
            # whitespace before a <w>): the lem text is still pending.
            doit = True
        prev_node = node

    for node in textRoot.findall('OLDlem'):
        textRoot.remove(node)
    for node in textRoot.findall('NEWlem'):
        node.tag = 'lem'
    return textRoot


# This function tries to match a lem inside a <lem> node (ElementTree Element object 'lem'), by its attribute 'n',
# to a lem in the lem list, the json object 'lems'
def getLemByCode(lem):
    """Return the first entry of ``lems`` matching the <lem> element *lem*.

    An entry matches when the 'n' attribute of *lem* is contained in the
    entry's 'coordinate' value.
    NOTE(review): if 'coordinate' is a string, ``in`` is a substring test;
    if it is a list, it is an exact-item test - confirm against the JSON.

    Raises:
        ValueError: no entry of ``lems`` matches.
        KeyError: *lem* has no 'n' attribute.
    """
    code = lem.attrib['n']
    candidates = (entry for entry in lems if code in entry['coordinate'])
    for entry in candidates:
        return entry
    raise ValueError("code " + code + " not found")


# Dictionary assigning to each OVI lem type a tag useful for the final TEI output
# Keys are the OVI abbreviations, values the spelled-out Italian labels.
# Only the commented-out code in redefineLems() refers to this table.
lemTypeDict = {
    's.m.': "sostantivo maschile",
    's.f.': 'sostantivo femminile',
    'antr.': 'antroponimo',
    'agg.': 'aggettivo',
    'n.g.': 'nome di luogo',
    'v.': 'verbo',
}


# This function processes each lem attributes and adds more tags around the lem useful for the final TEI output
def redefineLems(textRoot, fileCode):
    """Turn every <lem> under *textRoot* into its final TEI element, in place.

    The 'n' attribute of each lem is first prefixed with ``fileCode + '_'``
    and the element is then looked up with getLemByCode(). The 'categoria'
    of the matching record selects the result:

    * 'antr.' -> <persName>, with ref="#<id>"
    * 'n.g.'  -> <placeName>, with ref="#<id>"
    * other   -> <w>, with ref="#<id>", pos=<categoria>, type=<iperlemma>

    The 'forma_standard' and 'note' fields of the record are not exported.

    Raises:
        ValueError: raised by getLemByCode() when no record matches.
        KeyError: a lem has no 'n' attribute, or the record lacks a field.
    """
    for lemNode in textRoot.iter('lem'):
        attrs = lemNode.attrib
        attrs['n'] = fileCode + '_' + attrs['n']

        record = getLemByCode(lemNode)
        reference = '#' + str(record['id'])
        category = record['lemma']['categoria']
        hyperLemma = record['lemma']['iperlemma']

        if category == 'antr.':
            newTag = 'persName'
        elif category == 'n.g.':
            newTag = 'placeName'
        else:
            newTag = 'w'

        lemNode.tag = newTag
        attrs['ref'] = reference
        if newTag == 'w':
            # Only ordinary words carry the grammatical information.
            attrs['pos'] = category
            attrs['type'] = hyperLemma


def replacepbcode(textRoot, fileCode):
    """Number every <pb> under *textRoot*, in document order, starting at 1.

    For the k-th <pb> the attributes are set to
    ``n = fileCode + ' c. ' + k`` and ``xml:id = fileCode + '_' + k``.
    The tree is modified in place; nothing is returned.
    """
    for pageNumber, pageBreak in enumerate(textRoot.iter('pb'), start=1):
        suffix = str(pageNumber)
        pageBreak.set('n', fileCode + ' c. ' + suffix)
        pageBreak.set('xml:id', fileCode + '_' + suffix)
        

# def replaceimg(textRoot, fileCode):
#     folder = "/Volumes/GoogleDrive-117836417327186331381/Il mio Drive/OVI-CNR/images-all/"
#     listfolder = os.listdir(folder)
#     files_dir = [f for f in listfolder if os.path.isdir(os.path.join(folder, f))]  
#     for ii, node in enumerate(textRoot.iter('pb')):
#         pbfile = fileCode
#         print("PB "+pbfile)
#         filecodeupper = (fileCode + '_0' + str(ii + 1)).upper()+'.jpg'
#         done = FALSE
#         for imgfile in files_dir:
#             if imgfile == pbfile:         
#                 node.attrib['facs'] = filecodeupper
#                 done = TRUE
#                 print("found "+filecodeupper)
#             if  imgfile != pbfile:
#                 if imgfile != filecodeupper:
#                     node.attrib['facs'] = "NO_IMAGE.jpg"
#                     node.attrib['facs'] = 'NO_IMAGE' + '_' + str(ii + 1)+'.jpg'
#                     print("not found "+filecodeupper)

def replaceimg(textRoot, fileCode,
               folder="/Volumes/GoogleDrive-117836417327186331381/Il mio Drive/OVI-CNR/images-all/"):
    """Set the 'facs' attribute of every <pb> under *textRoot*, in place.

    When *folder* contains an entry named ``fileCode.upper()``, the k-th
    <pb> (counting from 1) gets
    ``facs = '<FILECODE>/<FILECODE>_0<k>.jpg'``; otherwise every <pb> gets
    ``facs = 'NO_IMAGE/NO_IMAGE.jpg'``. One "found"/"not found" line is
    printed per <pb>.

    Args:
        textRoot: element whose <pb> descendants are updated.
        fileCode: OVI sigla of the letter.
        folder: directory listing the per-letter image folders; defaults to
            the location used so far.

    Raises:
        OSError: *folder* cannot be listed (e.g. the volume is not mounted).

    NOTE(review): only the presence of the letter's folder is checked, not
    of the individual image files; and the fixed '_0' prefix yields '_010'
    for the tenth page - confirm both against the image naming scheme.
    """
    folderCode = fileCode.upper()
    # The answer is the same for every page, so look it up once. A plain
    # bool replaces the pickle opcode constants FALSE/TRUE, which are
    # non-empty byte strings and therefore both truthy.
    hasImageFolder = folderCode in os.listdir(folder)

    for pageNumber, node in enumerate(textRoot.iter('pb'), start=1):
        imageName = (fileCode + '_0' + str(pageNumber)).upper() + '.jpg'
        if hasImageFolder:
            node.attrib['facs'] = folderCode + "/" + imageName
            print("found " + imageName)
        else:
            node.attrib['facs'] = "NO_IMAGE/NO_IMAGE.jpg"
            print("not found " + imageName)

def surroundPages(textRoot):
    """Return a new <div> holding one <p> per page of *textRoot*.

    One empty <p> is created for every <pb> found anywhere under
    *textRoot*. The direct children of *textRoot* are then distributed in
    order: each direct <pb> child opens the next <p> and is stored as its
    first element, and the children that follow go into the same <p>.

    Not copied: the text of *textRoot*, every child that precedes the first
    <pb>, and every <milestone> child (together with its tail text).
    NOTE(review): confirm that dropping the tail of <milestone> is intended.

    The children are appended to the new tree without being removed from
    *textRoot*.
    """
    pagesRoot = ET.fromstring("<div/>")
    for _ in textRoot.iter('pb'):
        ET.SubElement(pagesRoot, 'p')

    # -1 means that no page break has been met yet.
    currentPage = -1
    for element in textRoot:
        if element.tag == 'pb':
            currentPage += 1
        if currentPage < 0 or element.tag == 'milestone':
            continue
        # The element keeps its own tail, so the text following it moves
        # along with it.
        pagesRoot[currentPage].append(element)

    return pagesRoot


# Get the letter template as a string
def getTemplateString():
    """Return pre_letter_template.xml as a string with a placeholder added.

    An empty <letterBody> element is appended to the first TEI <div> found
    under the first TEI <body> of the template, and the whole document is
    then serialized to a unicode string.

    Raises:
        IndexError: the template has no TEI <body>, or that <body> has no
            TEI <div>.
    """
    templateRoot = ET.parse(
        '/Users/federicaspinelli/TEAMOVI/Parser/OVI/EVT/pre_letter_template.xml'
    ).getroot()
    bodyNode = list(templateRoot.iter(uri1+'body'))[0]
    targetDiv = list(bodyNode.iter(uri1+'div'))[0]
    # Placeholder that the caller replaces with the processed letter text.
    ET.SubElement(targetDiv, 'letterBody')
    return ET.tostring(templateRoot, encoding='unicode', method='xml')

# Get a file by OVI sigla 'filecode' as an ElementTree object, process its lem, transform it to string and format it


def newProcessFile(filecode, inputdirectory=baseindir):
    """Convert the xmlgat file of OVI sigla *filecode* into a serialized <div>.

    Steps, in order: load the file (getLetterRootFromFile), wrap the lems
    (surroundLems), convert them to TEI elements (redefineLems), number the
    page breaks (replacepbcode), attach the image references (replaceimg)
    and group the content by page (surroundPages).

    Returns:
        The resulting tree as a unicode XML string, in which every
        '<lb />' followed by a newline is followed by a fixed indent.
    """
    letterRoot = getLetterRootFromFile(filecode, inputdirectory)
    lemText = surroundLems(letterRoot)

    # The three steps below modify lemText in place.
    redefineLems(lemText, filecode)
    replacepbcode(lemText, filecode)
    replaceimg(lemText, filecode)

    pagedText = surroundPages(lemText)
    serialized = ET.tostring(pagedText, encoding='unicode', method='xml')

    # Indent the line that follows each line break of the letter.
    lineBreak = "<lb />\n"
    return serialized.replace(lineBreak, lineBreak + "               ")

# %%
# Build the template string once; getTemplateString() appends the
# <letterBody> placeholder that is replaced below.
letterTemplateString = getTemplateString()
# %%
# Example: process the letter with OVI sigla 'z99' and write the filled-in
# template to test.xml.
filecodeExample = 'z99'
# NOTE(review): no encoding is passed to open(), so the platform default is
# used for the output file - confirm this is the intended encoding.
with open('/Users/federicaspinelli/TEAMOVI/Parser/OVI/EVT/test.xml', 'w') as f1:
    # The search string must match how ElementTree serializes the empty
    # placeholder element ('<letterBody />').
    newString = letterTemplateString.replace(
        '<letterBody />', newProcessFile(filecodeExample))
    f1.write(newString)
# %%

# %%