# TEAMOVI / Parser
# %%
# Imports
import xml.etree.ElementTree as ET
import re
import json
import os
# %%
# Import lems list + info file + authority files
# Root directory of the parser data; every other path in this script is
# built by appending to it, so it must end with a slash.
basedir = '/home/kora/Desktop/OVI_Data/Development/Parser/Data/'

# lems
# Load the lemma list once at import time. The context manager guarantees the
# file handle is closed as soon as the JSON has been read.
# The rest of the script iterates over `lems`, indexes it by integer position
# and reads the 'coordinate', 'id' and 'lemma' -> 'categoria' keys of its
# entries.
with open(basedir + 'DallOVI/datiniXML/power_lemmarioB.json', 'r') as lems_file:
    lems = json.load(lems_file)
# %%
def lemIndex(lem):
    """Return the ``id`` of the first entry of ``lems`` matching *lem*.

    An entry matches when the value of the element's ``n`` attribute is
    contained in the entry's ``'coordinate'`` value.

    Args:
        lem: an element exposing an ``attrib`` mapping with an ``'n'`` key.

    Returns:
        The ``'id'`` value of the first matching entry.

    Raises:
        ValueError: if no entry of ``lems`` contains the code.
        KeyError: if *lem* has no ``n`` attribute.
    """
    code = lem.attrib['n']
    hit = next((entry for entry in lems if code in entry['coordinate']), None)
    if hit is None:
        raise ValueError("code " + code + " not found")
    return hit['id']
# %%
# Import individual letter files
# Directory the xmlgat input files are read from.
indir = f"{basedir}DallOVI/datiniXML/xmlgat/"
# Directory the converted xmlevt files are written to.
outdir = f"{basedir}DallOVI/datiniXML/xmlevt/"
# %%
# <lem> elements in the xmlgat files have no children.
# A single-word lem is stored in the tail of its corresponding <lem> tag;
# a multiple-word lem is stored in <w> tags immediately following the <lem>.
# The body of the text is inside a single <div>.

# FUNCTION TO PROCESS A FILE

def processFile(indir, filecode):
    """Parse one xmlgat file and rewrite its <lem> elements in place.

    The file ``indir + 'xmlgat.' + filecode + '.xml'`` is parsed and the
    direct <lem>/<w> children of its first <div> are processed:

    1. A <lem> that has a tail receives the first word of that tail as its
       text, and that word is removed from the tail.
    2. A <w> that follows a node which did not take its text from a tail is
       renamed to <lem> and receives a copy of the preceding node's
       attributes; the preceding node is removed from the <div>.
    3. Every resulting <lem> has its ``n`` attribute prefixed with
       ``filecode + '_'``, resolved through ``lemIndex`` and replaced by a
       ``ref`` attribute of the form ``'#<id>'``.
    4. For lemma category ``'antr.'`` the text is moved into a <persName>
       child, for ``'n.g.'`` into a <placeName> child; the child carries the
       same ``ref``.

    Args:
        indir: directory prefix of the input files (concatenated as-is, so
            it must end with a path separator).
        filecode: code of the file, i.e. the part of the file name between
            ``'xmlgat.'`` and ``'.xml'``.

    Returns:
        The modified ``xml.etree.ElementTree.ElementTree``.

    Raises:
        xml.etree.ElementTree.ParseError: if the file is not well-formed XML.
        IndexError: if the document has no <div>, or if a <lem> tail
            contains no word character.
        KeyError: if a processed element has no ``n`` attribute.
        ValueError: propagated from ``lemIndex`` when a code is unknown.
    """
    tree = ET.parse(indir + 'xmlgat.' + filecode + '.xml')
    root1 = tree.getroot()

    # The first <div> in document order holds the text body.
    textbody = list(root1.iter('div'))[0]

    # Only direct <lem> and <w> children take part in the pairing below.
    texttags = [node for node in textbody if node.tag in ('lem', 'w')]

    # `doit` is True when the previously visited node did not take its text
    # from a tail, so a <w> coming next supplies the text instead.
    doit = False
    prev_node = None
    for node in texttags:
        if doit and node.tag == 'w':
            node.tag = 'NEWlem'
            # Copy instead of sharing the dict, so that later attribute edits
            # on this node cannot leak into the element being discarded.
            node.attrib = dict(prev_node.attrib)
            prev_node.tag = 'OLDlem'
        if node.tag == 'lem' and node.tail is not None:
            thelem = re.findall(r'\w+', node.tail)[0]  # First word
            node.text = thelem
            # Remove only the matched word: without the count every later
            # occurrence of the same letters in the tail would be deleted too.
            # Everything before the first match is non-word characters, so
            # the first occurrence of the substring is the match itself.
            node.tail = node.tail.replace(thelem, '', 1)
            doit = False
        else:
            doit = True
        prev_node = node

    # Temporary tag names keep replaced and replacing nodes apart until the
    # pairing pass is finished. findall() returns a list, so removing nodes
    # while looping over it is safe.
    for node in textbody.findall('OLDlem'):
        textbody.remove(node)
    for node in textbody.findall('NEWlem'):
        node.tag = 'lem'

    # The passes below are kept separate on purpose: a missing ``n`` on any
    # element is reported (KeyError) before any lemma lookup is attempted.
    for node in textbody.findall('lem'):
        node.attrib['n'] = filecode + '_' + node.attrib['n']

    for node in textbody.findall('lem'):
        node.attrib['ref'] = '#' + str(lemIndex(node))
        node.attrib.pop('n')

    for node in textbody.findall('lem'):
        # The number after '#' is used as a position in `lems`.
        # NOTE(review): this assumes each entry's id equals its list index
        # - confirm against the lemma file.
        ind = int(node.attrib['ref'][1:])
        categoria = lems[ind]['lemma']['categoria']
        if categoria == 'antr.':
            wrapper = 'persName'
        elif categoria == 'n.g.':
            wrapper = 'placeName'
        else:
            continue
        sb = ET.SubElement(node, wrapper)
        sb.text = node.text
        sb.attrib['ref'] = node.attrib['ref']
        node.text = ''

    return tree
#%%
# Example file
# Convert the single file with code '99b' and write the result to outdir.
filecodeexample = '99b'
tree1 = processFile(indir, filecodeexample)
tree1.write(f"{outdir}xmlevt-{filecodeexample}.xml")
# %%
# Another example file
# Convert the single file with code '80c' and write the result to outdir.
filecodeexample = '80c'
tree1 = processFile(indir, filecodeexample)
tree1.write(f"{outdir}xmlevt-{filecodeexample}.xml")
# %%
# Convert every xmlgat file found in indir and write the result to outdir.
# Files that fail with one of the exceptions below are reported by name and
# skipped, so one bad file does not stop the batch.
# NOTE(review): a ValueError raised by lemIndex for an unknown code is not
# caught here and stops the batch - confirm that this is intended.
for file in os.listdir(indir):
    # Only names of the form 'xmlgat.<code>.xml' are input files. Anything
    # else would make processFile look for a file that does not exist.
    if not (file.startswith('xmlgat.') and file.endswith('.xml')):
        continue
    if len(file) <= len('xmlgat.') + len('.xml'):
        continue  # prefix and suffix overlap: there is no code in between
    # Strip exactly one prefix and one suffix; str.replace would also remove
    # the same substrings from the middle of the code.
    local_filecode = file[len('xmlgat.'):-len('.xml')]
    try:
        local_tree = processFile(indir, local_filecode)
        local_tree.write(outdir + 'xmlevt-' + local_filecode + '.xml')
    except ET.ParseError:
        print("ParseError - " + file)
    except KeyError:
        print("KeyError - " + file)
    except IndexError:
        print("IndexError - " + file)
print('DONE!')
# %%
# Parse the xmlgat file with code 'j92' as-is, without running processFile.
filecodeexample = 'j92'
tree2 = ET.parse(f"{indir}xmlgat.{filecodeexample}.xml")


# %%