123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141 |
- import { Injectable } from '@angular/core';
- import { AppConfig } from '../../app.config';
- import { EditionStructure, GenericElement, OriginalEncodingNodeType, Page, XMLElement } from '../../models/evt-models';
- import { createNsResolver, getElementsBetweenTreeNode, isNestedInElem } from '../../utils/dom-utils';
- import { GenericParserService } from './generic-parser.service';
- import { getID, ParseResult } from './parser-models';
/**
 * Splits an XML-encoded edition into pages, using page-break (`pb`) elements
 * as page boundaries, and parses the content of each page through
 * `GenericParserService`.
 */
@Injectable({
  providedIn: 'root',
})
export class StructureXmlParserService {
  constructor(
    private genericParserService: GenericParserService,
  ) {
  }

  /** Value of the `type` attribute that flags front-matter content to be kept as page content. */
  private frontOrigContentAttr = 'document_front';
  readonly frontTagName = 'front';
  readonly pageTagName = 'pb';
  readonly bodyTagName = 'body';

  /**
   * Builds the list of pages of the edition rooted at `el`.
   *
   * Page breaks carrying an `ed` attribute are ignored. When page breaks occur
   * in both the front and the body, every page break is paged against the
   * `text` element. Otherwise front and body are handled separately, and a
   * section without page breaks is emitted as one single page (the front only
   * if it is marked as original content).
   *
   * @param el Root node of the edition; may be falsy.
   * @returns The edition structure; `pages` is empty when `el` is falsy or has
   * no element children.
   */
  parsePages(el: XMLElement): EditionStructure {
    // Without any element child there is neither a front, a body nor a page break,
    // and the owner document could not be resolved below.
    if (!el || !el.firstElementChild) { return { pages: [] }; }

    const front: XMLElement = el.querySelector(this.frontTagName);
    const body: XMLElement = el.querySelector(this.bodyTagName);

    const pbs = Array.from(el.querySelectorAll(this.pageTagName)).filter((p) => !p.getAttribute('ed'));
    const frontPbs = pbs.filter((p) => isNestedInElem(p, this.frontTagName));
    const bodyPbs = pbs.filter((p) => isNestedInElem(p, this.bodyTagName));
    const doc = el.firstElementChild.ownerDocument;

    if (frontPbs.length > 0 && bodyPbs.length > 0) {
      return {
        pages: pbs.map((pb: XMLElement, idx, arr: XMLElement[]) => this.parseDocumentPage(doc, pb, arr[idx + 1], 'text')),
      };
    }

    const frontPages = frontPbs.length === 0 && front && this.isMarkedAsOrigContent(front)
      ? [this.parseSinglePage(doc, front, 'page_front', this.frontTagName, 'facs_front')]
      : frontPbs.map((pb, idx, arr) => this.parseDocumentPage(doc, pb as HTMLElement, arr[idx + 1] as HTMLElement, this.frontTagName));

    let bodyPages: Page[];
    if (bodyPbs.length > 0) {
      bodyPages = bodyPbs.map((pb, idx, arr) => this.parseDocumentPage(doc, pb as HTMLElement, arr[idx + 1] as HTMLElement, this.bodyTagName));
    } else if (body) {
      bodyPages = [this.parseSinglePage(doc, body, 'page1', 'mainText', 'facs1')]; // TODO: translate mainText
    } else {
      // No body element at all: there is nothing to turn into a body page.
      bodyPages = [];
    }

    return {
      pages: [...frontPages, ...bodyPages],
    };
  }

  /**
   * Builds the page that starts at the page break `pb`.
   *
   * @param doc Document that owns `pb`.
   * @param pb Page-break element opening the page.
   * @param nextPb Page-break element opening the following page, if any.
   * @param ancestorTagName Tag name of the section that bounds the last page.
   * @returns The page with its original nodes, parsed content and image urls.
   */
  parseDocumentPage(doc: Document, pb: XMLElement, nextPb: XMLElement, ancestorTagName: string): Page {
    /* If there is a next page we retrieve the elements between the two page nodes,
       otherwise we retrieve the nodes between the page node and the last node of the
       last `ancestorTagName` element. */
    const ancestors = doc.querySelectorAll(ancestorTagName);
    const lastAncestor = ancestors.length > 0 ? ancestors[ancestors.length - 1] : undefined;
    // When no such ancestor exists, fall back to the end of the document instead of failing.
    const nextNode = nextPb || (lastAncestor ? lastAncestor.lastChild : doc.documentElement.lastChild);

    const originalContent = getElementsBetweenTreeNode(pb, nextNode)
      .filter((n) => n.tagName !== this.pageTagName)
      .filter((c) => ![4, 7, 8].includes(c.nodeType)); // Filter comments, CDATAs, and processing instructions

    // Computed once so that `url`/`facsUrl` always agree with `id`/`facs`.
    const id = getID(pb, 'page');
    // Keep only the part after the last '#' of the `facs` reference.
    const facs = (pb.getAttribute('facs') || 'page').split('#').slice(-1)[0];

    return {
      id,
      label: pb.getAttribute('n') || 'page',
      facs,
      originalContent,
      parsedContent: this.parsePageContent(doc, originalContent),
      url: this.getPageUrl(id),
      facsUrl: this.getPageUrl(facs),
    };
  }

  /**
   * Builds one page out of the whole content of `el`, for sections that have
   * no page breaks.
   */
  private parseSinglePage(doc: Document, el: XMLElement, id: string, label: string, facs: string): Page {
    // An empty element has no first/last child to delimit a range with.
    const originalContent: XMLElement[] = el.firstChild
      ? getElementsBetweenTreeNode(el.firstChild, el.lastChild)
      : [];

    return {
      id,
      label,
      facs,
      originalContent,
      parsedContent: this.parsePageContent(doc, originalContent),
      url: this.getPageUrl(id),
      facsUrl: this.getPageUrl(facs),
    };
  }

  /**
   * Returns the url of the `.jpg` image in the configured images folder whose
   * name is the part of `id` before the first '.'.
   */
  private getPageUrl(id: string): string {
    // TODO: check if exists <graphic> element connected to page and return its url
    // TODO: handle multiple version of page
    const image = id.split('.')[0];

    return `${AppConfig.evtSettings.files.imagesFolderUrl}/${image}.jpg`;
  }

  /**
   * Parses the nodes of a page.
   *
   * Nodes that are, or are nested in, the front are only parsed when flagged
   * as original content. A `text` element containing a front is expanded into
   * its children. Every other node is parsed as is.
   *
   * @param doc Document used to resolve nodes that carry an `xpath` attribute.
   * @param pageContent Nodes belonging to the page.
   * @returns The flattened list of parsed elements.
   */
  parsePageContent(doc: Document, pageContent: OriginalEncodingNodeType[]): Array<ParseResult<GenericElement>> {
    return pageContent
      .map((node) => {
        const origEl = getEditionOrigNode(node, doc);
        if (!origEl) {
          // The `xpath` attribute did not resolve to any node: nothing to parse.
          return [] as Array<ParseResult<GenericElement>>;
        }

        if (origEl.nodeName === this.frontTagName || isNestedInElem(origEl, this.frontTagName)) {
          if (this.hasOriginalContent(origEl)) {
            return Array.from(origEl.querySelectorAll(`[type=${this.frontOrigContentAttr}]`))
              .map((c) => this.genericParserService.parse(c as XMLElement));
          }
          if (this.isMarkedAsOrigContent(origEl)) {
            return [this.genericParserService.parse(origEl)];
          }

          return [] as Array<ParseResult<GenericElement>>;
        }

        if (origEl.tagName === 'text' && origEl.querySelectorAll && origEl.querySelectorAll(this.frontTagName).length > 0) {
          return this.parsePageContent(doc, Array.from(origEl.children) as HTMLElement[]);
        }

        return [this.genericParserService.parse(origEl)];
      })
      .reduce((x, y) => x.concat(y), []);
  }

  /**
   * Tells whether `el` has at least one descendant flagged as original front
   * content. Nodes that cannot be queried (e.g. text or comment nodes) have none.
   */
  hasOriginalContent(el: XMLElement): boolean {
    return typeof el.querySelectorAll === 'function'
      && el.querySelectorAll(`[type=${this.frontOrigContentAttr}]`).length > 0;
  }

  /**
   * Tells whether `el` is flagged as original front content, either directly,
   * through a descendant, or through `isNestedInElem` matching the flag
   * attribute. Text nodes are never considered flagged.
   */
  isMarkedAsOrigContent(el: XMLElement): boolean {
    if (el.nodeType === 3) {
      return false;
    }
    // Non-element nodes (e.g. comments) expose no attributes.
    const ownType = typeof el.getAttribute === 'function' ? el.getAttribute('type') : null;

    return ownType === this.frontOrigContentAttr ||
      this.hasOriginalContent(el) ||
      isNestedInElem(el, '', [{ key: 'type', value: this.frontOrigContentAttr }]);
  }
}
/**
 * Resolves the node a placeholder stands for.
 *
 * When `el` carries a non-empty `xpath` attribute, that path is evaluated
 * against `doc` and the first matching node is returned. In a namespaced
 * document every '/' of the path is rewritten to '/ns:' before evaluation.
 * Any other node is returned unchanged.
 *
 * @param el Node to resolve; may be a node without attribute support.
 * @param doc Document the xpath is evaluated against.
 * @returns The first node matched by the xpath, or `el` itself when it has no xpath.
 */
function getEditionOrigNode(el: XMLElement, doc: Document) {
  const xpath = el.getAttribute ? el.getAttribute('xpath') : undefined;
  if (!xpath) {
    return el;
  }

  let path = xpath;
  if (doc.documentElement.namespaceURI) {
    path = xpath.replace(/\//g, '/ns:');
  }
  const matches = doc.evaluate(path, doc, createNsResolver(doc), XPathResult.ANY_TYPE, undefined);

  return matches.iterateNext() as XMLElement;
}
|