123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117 |
- # %%
- # Imports
- import xml.etree.ElementTree as ET
- import re
- import csv
- import json
# %%
# Import lems list + info file + authority files
basedir = '/home/kora/Desktop/OVI_Data/Development/Parser/Data/'

# lems: list of lemma records; usage below reads 'id', 'coordinate' and
# 'lemma' keys from each record -- TODO confirm against the JSON file.
# Fixed: the original json.load(open(...)) never closed the file handle.
with open(basedir + 'DallOVI/datiniXML/power_lemmarioB.json', 'r') as infile:
    lems = json.load(infile)

# datini people EAC
with open(basedir + 'DallASPO/data_eac_datini.csv') as infile:
    data_eac = [row for row in csv.DictReader(infile)]

# datini OVI-ASPO data
with open(basedir + 'FULL_MERGED.csv') as infile:
    datini_oviaspo = [row for row in csv.DictReader(infile)]
- # %%
def lemIndex(lem, lem_records=None):
    """Return the 'id' of the lemma record whose 'coordinate' list contains
    the lem element's 'n' attribute.

    lem         -- an ElementTree element carrying an 'n' attribute.
    lem_records -- optional list of lemma dicts to search; defaults to the
                   module-level `lems` loaded above (parameter added for
                   testability, backward-compatible).

    Raises ValueError (as before) when no record matches; the failing key
    is now included in the exception for easier debugging.
    """
    records = lems if lem_records is None else lem_records
    key = lem.attrib['n']
    for item in records:
        if key in item['coordinate']:
            return item['id']
    # Replaces the original for/else construct: raise only after the whole
    # list has been scanned without a match.
    raise ValueError(key)
# %%
# Import individual letter files
# Example file
filecodeexample = '99b'
tree1 = ET.parse(basedir + 'DallOVI/datiniXML/xmlgat/' + 'xmlgat.' + filecodeexample + '.xml')
root1 = tree1.getroot()
# %%
# Lems in the xmlgat files have no children;
# Single-word lems are in the tail of the corr. lem tags;
# Multiple-word lems are in <w> tags immediately following the <lem>
# The body of the text is inside a single <div>
# TRY TO PROCESS THE EXAMPLE FILE
textbody = list(root1.iter('div'))[0]
# Direct children of the <div> that are either <lem> or <w> tags.
texttags = [child for child in textbody if child.tag in ('lem', 'w')]
print(len(texttags))
ET.dump(textbody)
# %%
# Normalize lem markup:
#  * single-word lems: the word sits in the <lem>'s tail -> pull the first
#    word into the tag's text;
#  * multiple-word lems: the text sits in a <w> tag right after the <lem> ->
#    copy the lem's attributes onto the <w>, promote it to a lem, and mark
#    the original (empty) <lem> for removal.
# NOTE(review): indentation reconstructed from a whitespace-mangled source;
# the nesting below (second `if` as a sibling of the first, `else` attached
# to the second `if`) is the most plausible reading -- confirm against the
# original file.
worklist = []   # audit trail of (kind, lem position id, word) tuples
doit = False    # True while the previous <lem> is still awaiting its <w>
for node in texttags:
    if doit and node.tag=='w':
        # Multi-word case: this <w> carries the text of the preceding <lem>.
        worklist.append(('multiple-word', prev_node.attrib['n'], node.text))
        node.tag = 'NEWlem'             # promote <w>; renamed back to 'lem' below
        node.attrib = prev_node.attrib  # inherit the lem's attributes
        prev_node.tag = 'OLDlem'        # mark the now-empty <lem> for removal
    if node.tag == 'lem' and node.tail != None:
        # Single-word case: the first word of the tail belongs to this lem.
        thelem = re.findall(r'\w+', node.tail)[0] # First word
        worklist.append(('single-word', node.attrib['n'], thelem))
        node.text = thelem
        # NOTE(review): str.replace removes *every* occurrence of the word
        # in the tail, not only the leading one -- possible over-deletion.
        node.tail = node.tail.replace(thelem, '')
        doit = False
    else:
        doit = True
    prev_node = node
# Drop the emptied lems and finish promoting the <w> replacements.
for node in textbody.findall('OLDlem'):
    textbody.remove(node)
for node in textbody.findall('NEWlem'):
    node.tag = 'lem'
# Prefix each lem's positional id with the letter's file code.
for node in textbody.findall('lem'):
    node.attrib['n'] = filecodeexample + '_' + node.attrib['n']
ET.dump(textbody)
# %%
# Replace each lem's positional 'n' attribute with a '#<id>' reference
# into the lems list.
for lem_el in textbody.findall('lem'):
    lem_el.attrib['ref'] = '#' + str(lemIndex(lem_el))
    del lem_el.attrib['n']
ET.dump(textbody)
# %%
# Wrap anthroponyms ('antr.') in <persName> and geographic names ('n.g.')
# in <placeName>, moving the lem's text and ref onto the new child element.
category_to_tag = {'antr.': 'persName', 'n.g.': 'placeName'}
for lem_el in textbody.findall('lem'):
    record = lems[int(lem_el.attrib['ref'][1:])]
    child_tag = category_to_tag.get(record['lemma']['categoria'])
    if child_tag is not None:
        child = ET.SubElement(lem_el, child_tag)
        child.text = lem_el.text
        child.attrib['ref'] = lem_el.attrib['ref']
        lem_el.text = ''
ET.dump(textbody)
tree1.write(basedir + 'prova.xml')
# %%
def _lems_by_category(categoria, records=None):
    """Return [{'id', 'lemma'}] projections of the lemma records whose
    lemma 'categoria' equals `categoria`.

    records -- optional list to search; defaults to the module-level `lems`.
    """
    # Replaces the duplicated list(filter(lambda ...)) constructions with a
    # single comprehension-based helper.
    source = lems if records is None else records
    return [{'id': item['id'], 'lemma': item['lemma']}
            for item in source
            if item['lemma']['categoria'] == categoria]

# %%
oviPlaces = _lems_by_category('n.g.')
with open(basedir + "ovi_places.json", "w") as outfile:
    json.dump(oviPlaces, outfile, indent=2)
# %%
oviNames = _lems_by_category('antr.')
with open(basedir + "ovi_names.json", "w") as outfile:
    json.dump(oviNames, outfile, indent=2)
# %%
print(len(oviPlaces))
print(len(oviNames))
# %%
|