# TEAMOVI / Parser

## IMPORTS

# Utilities to read/write csv files
import csv, json
from operator import truediv

# Custom class to store URIs + related infos for the ontologies/repositories

class RDFcoords:
    """Bundle the coordinates of one ontology/repository namespace.

    Attributes are plain, publicly readable fields:
      uri    -- namespace URI as passed by the caller
      prefix -- short prefix as passed by the caller
      code   -- optional extra value, ``None`` unless supplied
    """

    def __init__(self, uri, prefix, code=None):
        # Store the three coordinates verbatim; no validation is performed.
        self.uri, self.prefix, self.code = uri, prefix, code


# Repositories
# Project-specific namespaces (URIs already wrapped in angle brackets for TTL).
museoCoords = RDFcoords(uri='<https://palazzopretorio.prato.it/it/le-opere/alcuni-capolavori/>', prefix='mpp:')
autCoords = RDFcoords(uri='<https://palazzopretorio.prato.it/it/opere/autori/>', prefix='aut:')
foafCoords = RDFcoords(uri='<http://xmlns.com/foaf/0.1/>', prefix='foaf:')

# Shared vocabularies / ontologies.
cidocCoords = RDFcoords(uri='<http://www.cidoc-crm.org/cidoc-crm/>', prefix='crm:')
aatCoords = RDFcoords(uri='<http://vocab.getty.edu/aat/>', prefix='aat:')
nsCoords = RDFcoords(uri='<http://www.w3.org/1999/02/22-rdf-syntax-ns#>', prefix='rdf:')
schemaCoords = RDFcoords(uri='<http://www.schema.org/>', prefix='schema:')
rdfsCoords = RDFcoords(uri='<http://www.w3.org/2000/01/rdf-schema#>', prefix='rdfs:')


# Basic utilities to format triples / shortened triples in TTL format
#
# Format full triple
def triple(subject, predicate, object1):
    """Return ``subject predicate object1`` joined by single spaces.

    No line terminator is appended; the caller adds it.
    """
    return ' '.join((subject, predicate, object1))

# Format entry in predicate list (no subject)
def doublet(predicate, object1):
    """Return a four-space-indented ``predicate object1`` pair (no subject).

    No line terminator is appended; the caller adds it.
    """
    pair = ' '.join((predicate, object1))
    return '    ' + pair

# Format entry in object list (object only)
def singlet(object1):
    """Return *object1* indented by eight spaces (object-list entry).

    No line terminator is appended; the caller adds it.
    """
    indent = '        '
    return indent + object1

# Line endings
# Terminator placed before a predicate-list entry, i.e. when the FOLLOWING
# triple shares the same subject.
continueLine1 = " ;\n"
# Terminator placed before an object-list entry, i.e. when the FOLLOWING
# triple shares both subject and predicate.
continueLine2 = " ,\n"
# Terminator that ends a triple or a whole block of triples.
closeLine = " .\n"


def writeTTLHeader(output):
    """Write the ``@prefix`` declarations for every known namespace.

    One ``@prefix <prefix> <uri> .`` line is written per namespace, in a fixed
    order, followed by a single empty line.

    Args:
        output: object exposing ``write(str)`` (e.g. an open text file).
    """
    # Order matters only for the layout of the generated header.
    namespaces = (
        museoCoords,
        foafCoords,
        autCoords,
        cidocCoords,
        aatCoords,
        schemaCoords,
        nsCoords,
        rdfsCoords,
    )
    for coords in namespaces:
        output.write('@prefix ' + coords.prefix + ' ' + coords.uri + closeLine)
    output.write('\n')

# Optional cap used by parse() to stop early (handy while testing);
# None means every CSV row is processed.
max_entries = None

def parsefromfile(mapfilename, infile, outfilename):
    """Convert an in-memory CSV payload to a TTL file using a JSON mapping.

    Args:
        mapfilename: path of the JSON mapping file; its decoded content is
            handed to ``parse`` unchanged.
        infile: the CSV content as ``bytes``; decoded with the default codec
            of ``bytes.decode()`` (UTF-8).
        outfilename: path of the output file, forwarded to ``parse``.
    """
    csv_text = infile.decode()
    # Materialise every row as a plain dict so the rows can be iterated freely.
    reader = csv.DictReader(csv_text.splitlines(), skipinitialspace=True)
    csv_dicts = [dict(row) for row in reader]

    # JSON is UTF-8 by specification: do not rely on the platform's locale
    # encoding, which breaks non-ASCII mappings on e.g. Windows.
    with open(mapfilename, encoding='utf-8') as mapfile:
        json_dicts = json.load(mapfile)

    # The mapping is fully loaded, so the map file is closed before the
    # (potentially long) conversion starts.
    parse(json_dicts, csv_dicts, outfilename)


def _column_value(csvrow, column):
    """Return the cell(s) of *csvrow* selected by a node's "colonna" entry.

    A list of column names yields the list of matching cells (same order);
    a single column name yields the single cell.
    """
    if isinstance(column, list):
        return [csvrow[name] for name in column]
    return csvrow[column]


def parse(json_dicts, csv_dicts, outfilename):
    """Write the TTL serialisation of *csv_dicts* to *outfilename*.

    For every CSV row except the first one, and for every mapping node whose
    CSV value is not empty, an ``rdf:type`` triple is written. When the node
    names a parent (``"sottoelementodi"``) that exists in the mapping, a second
    triple linking parent and node through ``"relazione"`` is written as well.

    Args:
        json_dicts: list of mapping nodes; the keys read here are "colonna",
            "uri", "tipo", "sottoelementodi", "identificatore", "relazione".
        csv_dicts: list of CSV rows, each a dict keyed by column name.
        outfilename: path of the file to (over)write.

    Raises:
        KeyError: if a node lacks one of the keys above or names a column that
            is missing from a row.
    """
    # Turtle documents are UTF-8: be explicit instead of depending on the
    # platform's locale encoding.
    with open(outfilename, 'w', encoding='utf-8') as outputfile:
        writeTTLHeader(outputfile)
        for ii, csvrow in enumerate(csv_dicts):
            # Skip the first line as it carries info we don't want to triplify
            if ii == 0:
                continue

            for node in json_dicts:
                csvvalue = _column_value(csvrow, node["colonna"])
                if checkEmptyValue(csvvalue):
                    continue

                node_uri = settripleuri(csvvalue, node["uri"])
                outputfile.write(
                    triple(node_uri, nsCoords.prefix + 'type', node["tipo"]) + closeLine
                )

                if node["sottoelementodi"] != '':
                    # First mapping node whose identifier matches, if any.
                    parent = next(
                        (el for el in json_dicts
                         if el["identificatore"] == node["sottoelementodi"]),
                        None,
                    )
                    if parent is not None:
                        parent_value = _column_value(csvrow, parent["colonna"])
                        parent_uri = settripleuri(parent_value, parent["uri"])
                        outputfile.write(
                            triple(parent_uri, node["relazione"], node_uri) + closeLine
                        )

                outputfile.write('\n')

            # To limit number of entries processed (if desired for testing
            # purposes): checked after the row has been written, so the row
            # whose index first exceeds max_entries is still emitted.
            if max_entries is not None and ii > max_entries:
                break

def settripleuri(csvvalue, nodeuri):
    """Fill the CSV placeholders of *nodeuri* and wrap the result in quotes.

    Args:
        csvvalue: either a single string, substituted for ``$VALORE_CSV$``, or
            a list of cells where item ``i`` is substituted for
            ``$VALORE_CSV_i$``. In a list, an empty (``''``) or missing
            (``None``) cell is rendered as ``N/A``.
        nodeuri: URI template taken from the mapping.

    Returns:
        The template with placeholders replaced, surrounded by double quotes.

    Raises:
        TypeError: if a single (non-list) *csvvalue* is not a string.
    """
    # NOTE(review): the double quotes make the result read as a string literal
    # in Turtle rather than an IRI / prefixed name - confirm this is intended.
    output = '"' + nodeuri + '"'

    if isinstance(csvvalue, list):
        for ii, value in enumerate(csvvalue):
            # csv.DictReader yields None for cells missing from short rows;
            # treat them like empty cells instead of crashing in str.replace.
            replacement = 'N/A' if value is None or value == '' else value
            output = output.replace('$VALORE_CSV_' + str(ii) + '$', replacement)
        return output

    return output.replace('$VALORE_CSV$', csvvalue)


def checkEmptyValue(csvvalue):
    """Tell whether *csvvalue* carries no data.

    Args:
        csvvalue: a single CSV cell, or a list of cells for multi-column nodes.

    Returns:
        ``True`` when the single cell is ``''`` or ``None``, or when every cell
        of the list is (an empty list counts as empty); ``False`` otherwise.
    """
    if isinstance(csvvalue, list):
        # None is what csv.DictReader stores for cells missing from a short
        # row, so it is treated exactly like an empty string.
        return all(cell is None or cell == '' for cell in csvvalue)
    return csvvalue is None or csvvalue == ''