123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117 |
- # %%
- # Imports
- import xml.etree.ElementTree as ET
- import re
- import csv
- import json
# %%
# Import lems list + info file + authority files
basedir = '/home/kora/Desktop/OVI_Data/Development/Parser/Data/'

# lems: list of lemma records; usage below reads 'id', 'coordinate' and
# 'lemma' keys from each record -- TODO confirm against the JSON file.
# Fixed: the original json.load(open(...)) never closed the file handle.
with open(basedir + 'DallOVI/datiniXML/power_lemmarioB.json', 'r') as infile:
    lems = json.load(infile)

# datini people EAC
with open(basedir + 'DallASPO/data_eac_datini.csv') as infile:
    data_eac = [row for row in csv.DictReader(infile)]

# datini OVI-ASPO data
with open(basedir + 'FULL_MERGED.csv') as infile:
    datini_oviaspo = [row for row in csv.DictReader(infile)]
- # %%
def lemIndex(lem, lem_records=None):
    """Return the 'id' of the lemma record whose 'coordinate' list contains
    the lem element's 'n' attribute.

    lem         -- an ElementTree element carrying an 'n' attribute.
    lem_records -- optional list of lemma dicts to search; defaults to the
                   module-level `lems` loaded above (parameter added for
                   testability, backward-compatible).

    Raises ValueError (as before) when no record matches; the failing key
    is now included in the exception for easier debugging.
    """
    records = lems if lem_records is None else lem_records
    key = lem.attrib['n']
    for item in records:
        if key in item['coordinate']:
            return item['id']
    # Replaces the original for/else construct: raise only after the whole
    # list has been scanned without a match.
    raise ValueError(key)
# %%
# Import individual letter files
# Example file
filecodeexample = '99b'
tree1 = ET.parse(basedir + 'DallOVI/datiniXML/xmlgat/' + 'xmlgat.' + filecodeexample + '.xml')
root1 = tree1.getroot()
# %%
# Lems in the xmlgat files have no children;
# Single-word lems are in the tail of the corr. lem tags;
# Multiple-word lems are in <w> tags immediately following the <lem>
# The body of the text is inside a single <div>
# TRY TO PROCESS THE EXAMPLE FILE
textbody = list(root1.iter('div'))[0]
# Direct children of the <div> that are either <lem> or <w> tags.
texttags = [child for child in textbody if child.tag in ('lem', 'w')]
print(len(texttags))
ET.dump(textbody)
# %%
# Normalize lem markup:
#  * single-word lems: the word sits in the <lem>'s tail -> pull the first
#    word into the tag's text;
#  * multiple-word lems: the text sits in a <w> tag right after the <lem> ->
#    copy the lem's attributes onto the <w>, promote it to a lem, and mark
#    the original (empty) <lem> for removal.
# NOTE(review): indentation reconstructed from a whitespace-mangled source;
# the nesting below (second `if` as a sibling of the first, `else` attached
# to the second `if`) is the most plausible reading -- confirm against the
# original file.
worklist = []   # audit trail of (kind, lem position id, word) tuples
doit = False    # True while the previous <lem> is still awaiting its <w>
for node in texttags:
    if doit and node.tag=='w':
        # Multi-word case: this <w> carries the text of the preceding <lem>.
        worklist.append(('multiple-word', prev_node.attrib['n'], node.text))
        node.tag = 'NEWlem'             # promote <w>; renamed back to 'lem' below
        node.attrib = prev_node.attrib  # inherit the lem's attributes
        prev_node.tag = 'OLDlem'        # mark the now-empty <lem> for removal
    if node.tag == 'lem' and node.tail != None:
        # Single-word case: the first word of the tail belongs to this lem.
        thelem = re.findall(r'\w+', node.tail)[0] # First word
        worklist.append(('single-word', node.attrib['n'], thelem))
        node.text = thelem
        # NOTE(review): str.replace removes *every* occurrence of the word
        # in the tail, not only the leading one -- possible over-deletion.
        node.tail = node.tail.replace(thelem, '')
        doit = False
    else:
        doit = True
    prev_node = node
# Drop the emptied lems and finish promoting the <w> replacements.
for node in textbody.findall('OLDlem'):
    textbody.remove(node)
for node in textbody.findall('NEWlem'):
    node.tag = 'lem'
# Prefix each lem's positional id with the letter's file code.
for node in textbody.findall('lem'):
    node.attrib['n'] = filecodeexample + '_' + node.attrib['n']
ET.dump(textbody)
# %%
# Replace each lem's positional 'n' attribute with a '#<id>' reference
# into the lems list.
for lem_el in textbody.findall('lem'):
    lem_el.attrib['ref'] = '#' + str(lemIndex(lem_el))
    del lem_el.attrib['n']
ET.dump(textbody)
# %%
# Wrap anthroponyms ('antr.') in <persName> and geographic names ('n.g.')
# in <placeName>, moving the lem's text and ref onto the new child element.
category_to_tag = {'antr.': 'persName', 'n.g.': 'placeName'}
for lem_el in textbody.findall('lem'):
    record = lems[int(lem_el.attrib['ref'][1:])]
    child_tag = category_to_tag.get(record['lemma']['categoria'])
    if child_tag is not None:
        child = ET.SubElement(lem_el, child_tag)
        child.text = lem_el.text
        child.attrib['ref'] = lem_el.attrib['ref']
        lem_el.text = ''
ET.dump(textbody)
tree1.write(basedir + 'prova.xml')
# %%
def _lems_by_category(categoria, records=None):
    """Return [{'id', 'lemma'}] projections of the lemma records whose
    lemma 'categoria' equals `categoria`.

    records -- optional list to search; defaults to the module-level `lems`.
    """
    # Replaces the duplicated list(filter(lambda ...)) constructions with a
    # single comprehension-based helper.
    source = lems if records is None else records
    return [{'id': item['id'], 'lemma': item['lemma']}
            for item in source
            if item['lemma']['categoria'] == categoria]

# %%
oviPlaces = _lems_by_category('n.g.')
with open(basedir + "ovi_places.json", "w") as outfile:
    json.dump(oviPlaces, outfile, indent=2)
# %%
oviNames = _lems_by_category('antr.')
with open(basedir + "ovi_names.json", "w") as outfile:
    json.dump(oviNames, outfile, indent=2)
# %%
print(len(oviPlaces))
print(len(oviNames))
# %%
|