123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117 |
- # %%
- # Imports
- import xml.etree.ElementTree as ET
- import re
- import json
- import os
- # %%
- # Import lems list + info file + authority files
# Root folder of the parser data; every other path in this script is built
# by appending to it, so it must end with a path separator.
basedir = '/home/kora/Desktop/OVI_Data/Development/Parser/Data/'
# lems
# The file is opened through a context manager so the handle is closed as
# soon as the JSON has been loaded, and the encoding is stated explicitly
# instead of depending on the locale default.
# The rest of the script indexes `lems` by position and reads the
# 'id', 'coordinate' and 'lemma' keys of its entries.
with open(basedir + 'DallOVI/datiniXML/power_lemmarioB.json', 'r',
          encoding='utf-8') as lems_file:
    lems = json.load(lems_file)
- # %%
def lemIndex(lem):
    """Return the ``id`` of the first ``lems`` entry that lists the code of *lem*.

    Args:
        lem: An element-like object whose ``attrib['n']`` holds the code to
            look up.

    Returns:
        The ``'id'`` value of the first entry of the module-level ``lems``
        whose ``'coordinate'`` value contains the code.

    Raises:
        ValueError: If no entry of ``lems`` contains the code.
        KeyError: If *lem* has no ``n`` attribute.
    """
    code = lem.attrib['n']
    for entry in lems:
        if code in entry['coordinate']:
            break
    else:
        # The loop ran to completion without a hit.
        raise ValueError("code " + code + " not found")
    return entry['id']
- # %%
- # Import individual letter files
# Folder the converted (xmlevt) files are written to
outdir = ''.join([basedir, 'DallOVI/datiniXML/xmlevt/'])
# Folder the source (xmlgat) files are read from
indir = ''.join([basedir, 'DallOVI/datiniXML/xmlgat/'])
- # %%
# Lems in the xmlgat files have no children.
# Single-word lems are in the tail of the corresponding <lem> tags;
# multiple-word lems are in <w> tags immediately following the <lem>.
# The body of the text is inside a single <div>.
# FUNCTION TO PROCESS A FILE
def processFile(indir, filecode):
    """Parse one xmlgat file and rewrite its ``<lem>`` elements in place.

    The file ``indir + 'xmlgat.' + filecode + '.xml'`` is parsed and the
    direct ``<lem>`` / ``<w>`` children of its first ``<div>`` are processed:

    * a ``<lem>`` whose tail contains a word gets that word as its text, and
      the word is cut out of the tail;
    * a ``<lem>`` whose tail is missing or contains no word is dropped, and
      the ``<w>`` that follows it becomes the ``<lem>``, taking over a copy
      of its attributes;
    * every resulting ``<lem>`` has its ``n`` attribute replaced by
      ``ref="#<id>"``, where the id is what ``lemIndex`` returns for the
      code ``filecode + '_' + n``;
    * when the ``lems`` entry at that id has category ``'antr.'`` or
      ``'n.g.'``, the text of the ``<lem>`` is moved into a ``<persName>``
      or ``<placeName>`` child carrying the same ``ref``.

    Args:
        indir: Folder prefix of the input file; it is concatenated as-is, so
            it must end with a path separator.
        filecode: Code of the letter, i.e. the middle part of the file name.

    Returns:
        The parsed ``ElementTree``, modified in place.

    Raises:
        xml.etree.ElementTree.ParseError: If the file is not well-formed XML.
        IndexError: If the file contains no ``<div>``, or an id is out of
            range for ``lems``.
        KeyError: If an element that ends up as ``<lem>`` has no ``n``
            attribute, or a ``lems`` entry lacks an expected key.
        ValueError: Propagated from ``lemIndex`` when a code is unknown.
    """
    tree = ET.parse(indir + 'xmlgat.' + filecode + '.xml')
    textbody = next(tree.getroot().iter('div'), None)
    if textbody is None:
        # Same exception type as indexing an empty list of <div> elements.
        raise IndexError("no <div> element in xmlgat." + filecode + ".xml")

    first_word = re.compile(r'\w+')
    promoted = []     # <w> elements that take the place of a <lem>
    superseded = []   # elements replaced by the <w> that follows them
    pending = False   # True when the previous lem/w tag carried no word of its own
    previous = None

    # Snapshot the candidates first: the loop changes text, tails and attributes.
    for node in [child for child in textbody if child.tag in ('lem', 'w')]:
        if node.tag == 'w':
            if pending:
                # Copy the attributes so the two elements never share a dict.
                node.attrib = dict(previous.attrib)
                promoted.append(node)
                superseded.append(previous)
            # NOTE(review): a <w> leaves `pending` set, so a <w> directly
            # after another <w> replaces that one as well. Kept as in the
            # original logic -- confirm this is intended.
            pending = True
        else:
            match = None
            if node.tail is not None:
                match = first_word.search(node.tail)
            if match is None:
                # No tail, or only whitespace/punctuation: the word is
                # expected in the following <w>.
                pending = True
            else:
                node.text = match.group()
                # Cut out only the matched word; later occurrences of the
                # same letters in the tail must stay untouched.
                node.tail = node.tail[:match.start()] + node.tail[match.end():]
                pending = False
        previous = node

    for node in superseded:
        textbody.remove(node)
    for node in promoted:
        node.tag = 'lem'

    # Category of the lems entry -> element the lem text is wrapped in.
    name_tags = {'antr.': 'persName', 'n.g.': 'placeName'}
    for node in textbody.findall('lem'):
        # lemIndex reads the code from the 'n' attribute of the node.
        node.attrib['n'] = filecode + '_' + node.attrib['n']
        ref = '#' + str(lemIndex(node))
        node.attrib['ref'] = ref
        node.attrib.pop('n')
        # The id doubles as the position of the entry inside `lems`.
        wrapper_tag = name_tags.get(lems[int(ref[1:])]['lemma']['categoria'])
        if wrapper_tag is not None:
            wrapper = ET.SubElement(node, wrapper_tag)
            wrapper.text = node.text
            wrapper.attrib['ref'] = ref
            node.text = ''

    return tree
- #%%
- # Example file
# Convert two sample letters one after the other; once the loop is done,
# `filecodeexample` and `tree1` refer to the last one ('80c').
for filecodeexample in ('99b', '80c'):
    tree1 = processFile(indir, filecodeexample)
    tree1.write(''.join([outdir, 'xmlevt-', filecodeexample, '.xml']))
- # %%
# Convert every xmlgat file found in `indir` and write the result to `outdir`.
# A file that cannot be converted is reported and skipped, so one bad file
# never stops the batch. Sorting keeps the order of the report stable.
for file in sorted(os.listdir(indir)):
    # Only entries named xmlgat.<code>.xml are letter files; any other entry
    # would make processFile look for a path that does not exist.
    name_match = re.fullmatch(r'xmlgat\.(.+)\.xml', file)
    if name_match is None:
        continue
    local_filecode = name_match.group(1)
    try:
        local_tree = processFile(indir, local_filecode)
        local_tree.write(outdir + 'xmlevt-' + local_filecode + '.xml')
    except (ET.ParseError, KeyError, IndexError, ValueError) as err:
        # ValueError is what lemIndex raises for a code missing from `lems`;
        # it is reported in the same "<ErrorName> - <file>" form as the others.
        print(type(err).__name__ + " - " + file)
print('DONE!')
- # %%
filecodeexample = 'j92'
# Parse the untouched xmlgat file for this code (no processing applied)
tree2 = ET.parse(''.join([indir, 'xmlgat.', filecodeexample, '.xml']))
- # %%
|