# %%
# Imports
import xml.etree.ElementTree as ET
import re
import json
import os
# %%
# Import lems list + xml info file
basedir = '../../Data/'

# lems
lemfile = basedir + 'DallOVI/datiniXML/power_lemmarioB.json'
# Read through a context manager so the handle is closed as soon as the JSON
# has been parsed; JSON is UTF-8 by specification, so the encoding is pinned
# instead of being left to the platform default.
with open(lemfile, 'r', encoding='utf-8') as lemhandle:
    lems = json.load(lemhandle)

# BiblioDatini.xml

infofile = basedir + 'DallOVI/datiniXML/BiblioDatini.xml'
infotree = ET.parse(infofile)
inforoot = infotree.getroot()
# Every <Biblio> element found anywhere below (or at) the root
infoBiblioNodeList = list(inforoot.iter('Biblio'))
# %%
# Quick look at what was loaded
print(type(lems))
print(lems[:10])
print('Main nodes in BiblioDatini.xml:', len(infoBiblioNodeList))
# %%
# Utils to extract data from the info files

def lemIndex(lem):
    """Return the ``'id'`` of the first entry of ``lems`` that matches *lem*.

    The ``n`` attribute of *lem* is tested with ``in`` against the
    ``'coordinate'`` value of each entry of the module-level ``lems``,
    in order; the first hit wins.

    Raises:
        ValueError: if no entry of ``lems`` matches.
    """
    code = lem.attrib['n']
    try:
        return next(
            entry['id'] for entry in lems if code in entry['coordinate']
        )
    except StopIteration:
        raise ValueError("code " + code + " not found") from None

def getBiblioNodeBySigla(sigla):
    """Return the first node of ``infoBiblioNodeList`` labelled *sigla*.

    A node matches when one of its direct children has the tag ``sigla``
    and a text equal to *sigla*. Returns ``None`` when no node matches.
    """
    def carriesSigla(biblio):
        return any(
            entry.tag == 'sigla' and entry.text == sigla
            for entry in biblio
        )

    return next(filter(carriesSigla, infoBiblioNodeList), None)
# %%
# Sanity check: look up one bibliography node by its sigla and print it
aa = getBiblioNodeBySigla('A03')
ET.dump(aa)
# %%
# Folders of the individual letter files: input (xmlgat) and output (xmlevt)
indir = f'{basedir}DallOVI/datiniXML/xmlgat/'
outdir = f'{basedir}DallOVI/datiniXML/xmlevt/'
# %%
# <lem> tags in the xmlgat files have no children.
# A single-word lem is stored in the tail of the corresponding <lem> tag.
# A multiple-word lem is stored in a <w> tag immediately following the <lem>.
# The body of the text is inside a single <div>.

# FUNCTION TO PROCESS A FILE

def _moveLemmaTextIntoTags(textbody):
    """Put the word(s) of every lemma inside its ``<lem>`` element.

    Only direct children of *textbody* tagged ``lem`` or ``w`` are looked at.

    * A ``<lem>`` with a tail: the first word of the tail becomes the text of
      the element and is cut out of the tail.
    * A ``<w>`` that comes right after a node that was not handled as a
      single-word ``<lem>``: the ``<w>`` takes over the attributes of that
      previous node and replaces it (the previous node is removed).

    Raises:
        IndexError: if the tail of a ``<lem>`` contains no word.
    """
    lemma_nodes = [node for node in textbody if node.tag in ('lem', 'w')]

    expect_words = False
    previous = None
    for node in lemma_nodes:
        if expect_words and node.tag == 'w':
            # Temporary tag names keep promoted / superseded nodes apart from
            # the <lem> elements that are still to be visited.
            node.tag = 'NEWlem'
            # Copy, so the two elements never share one attribute dict.
            node.attrib = dict(previous.attrib)
            previous.tag = 'OLDlem'
        if node.tag == 'lem' and node.tail is not None:
            first_word = re.search(r'\w+', node.tail)
            if first_word is None:
                # Same exception type as indexing an empty findall() result,
                # which is what callers already handle.
                raise IndexError(
                    "no word found after lem " + repr(node.attrib.get('n'))
                )
            node.text = first_word.group()
            # Cut out exactly the matched word. A plain str.replace() would
            # also delete every later occurrence of the same characters in
            # the tail, even inside other words.
            node.tail = (
                node.tail[:first_word.start()] + node.tail[first_word.end():]
            )
            expect_words = False
        else:
            expect_words = True
        previous = node

    for node in textbody.findall('OLDlem'):
        textbody.remove(node)
    for node in textbody.findall('NEWlem'):
        node.tag = 'lem'


def _linkLemmas(textbody, filecode):
    """Replace ``n`` by a ``ref`` on every ``<lem>`` and wrap proper names.

    ``n`` is first prefixed with ``filecode + '_'`` and resolved through
    ``lemIndex``; the result is stored as ``ref="#<id>"`` and ``n`` is
    dropped. Lemmas whose category is ``'antr.'`` / ``'n.g.'`` get their text
    moved into a ``<persName>`` / ``<placeName>`` child carrying the same
    ``ref``.
    """
    lem_nodes = textbody.findall('lem')

    # The three passes are kept separate so that a failure in an earlier
    # step is reported before any later step runs on any node.
    for node in lem_nodes:
        node.attrib['n'] = filecode + '_' + node.attrib['n']

    for node in lem_nodes:
        node.attrib['ref'] = '#' + str(lemIndex(node))
        del node.attrib['n']

    for node in lem_nodes:
        # NOTE(review): the id returned by lemIndex() is used as a position
        # in ``lems`` here - confirm that ids and list positions coincide.
        position = int(node.attrib['ref'][1:])
        category = lems[position]['lemma']['categoria']
        if category == 'antr.':
            wrapper_tag = 'persName'
        elif category == 'n.g.':
            wrapper_tag = 'placeName'
        else:
            continue
        wrapper = ET.SubElement(node, wrapper_tag)
        wrapper.text = node.text
        wrapper.attrib['ref'] = node.attrib['ref']
        node.text = ''


def processFile(indir, filecode):
    """Parse one xmlgat letter file and return the converted tree.

    The file ``indir + 'xmlgat.' + filecode + '.xml'`` is parsed, the first
    ``<div>`` found is taken as the text body, and its ``<lem>`` / ``<w>``
    children are rewritten in place: lemma words are moved inside the
    ``<lem>`` elements, ``n`` attributes are replaced by ``ref`` attributes
    pointing into ``lems``, and person / place names are wrapped in
    ``<persName>`` / ``<placeName>``.

    Args:
        indir: directory prefix of the input file (used by plain string
            concatenation, so it must end with a path separator).
        filecode: code of the letter, i.e. the middle part of the file name.

    Returns:
        The modified ``xml.etree.ElementTree.ElementTree``.

    Raises:
        xml.etree.ElementTree.ParseError: if the file is not well-formed.
        IndexError: if there is no ``<div>``, or a ``<lem>`` tail holds no
            word.
        KeyError: if a ``<lem>`` has no ``n`` attribute.
        ValueError: if ``lemIndex`` cannot resolve a lemma code.
    """
    tree = ET.parse(indir + 'xmlgat.' + filecode + '.xml')

    textbody = next(tree.getroot().iter('div'), None)
    if textbody is None:
        raise IndexError("no <div> element in file " + filecode)

    _moveLemmaTextIntoTags(textbody)
    _linkLemmas(textbody, filecode)

    return tree
#%%
# Example file
# First example: convert one letter and save the result
filecodeexample = '99b'

tree1 = processFile(indir, filecodeexample)
tree1.write(f'{outdir}xmlevt-{filecodeexample}.xml')
# %%
# Second example: same steps for another letter
filecodeexample = '80c'
tree1 = processFile(indir, filecodeexample)
tree1.write(f'{outdir}xmlevt-{filecodeexample}.xml')
# %%
# Convert every letter file of the input folder; a file that fails is
# reported and skipped so that one bad letter does not stop the batch.
# Sorted so that the log comes out in the same order on every run.
for file in sorted(os.listdir(indir)):
    # processFile() rebuilds the name as 'xmlgat.<code>.xml', so any other
    # entry of the folder cannot be processed and is skipped instead of
    # aborting the run with a FileNotFoundError.
    if not (file.startswith('xmlgat.') and file.endswith('.xml')):
        continue
    if len(file) <= len('xmlgat.') + len('.xml'):
        continue
    # Strip the prefix and the extension only, never a match inside the code.
    local_filecode = file[len('xmlgat.'):-len('.xml')]
    try:
        local_tree = processFile(indir, local_filecode)
        local_tree.write(outdir + 'xmlevt-' + local_filecode + '.xml')
    except ET.ParseError:
        print("ParseError - " + file)
    except KeyError:
        print("KeyError - " + file)
    except IndexError:
        print("IndexError - " + file)
    except ValueError as err:
        # Raised by lemIndex() when a lemma code is not in the lemma list.
        print("ValueError - " + file + " (" + str(err) + ")")
print('DONE!')
# %%
# Inspect a single letter: raw input first, converted output afterwards
filecodeexample = 'j91'

tree2 = ET.parse(f'{indir}xmlgat.{filecodeexample}.xml')
# %%
ET.dump(tree2)
# %%
tree3 = processFile(indir, filecodeexample)
# %%
ET.dump(tree3)
# %%
# Path of the input file that was just inspected
f'{indir}xmlgat.{filecodeexample}.xml'
# %%
# Scratch folder for the converted copy
tempdir = "/home/kora/Desktop/FREELANCE_LOCAL/"
# %%
tree3.write(f'{tempdir}xmlevt-{filecodeexample}.xml')
# %%