structure-xml-parser.service.ts 5.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141
  1. import { Injectable } from '@angular/core';
  2. import { AppConfig } from '../../app.config';
  3. import { EditionStructure, GenericElement, OriginalEncodingNodeType, Page, XMLElement } from '../../models/evt-models';
  4. import { createNsResolver, getElementsBetweenTreeNode, isNestedInElem } from '../../utils/dom-utils';
  5. import { GenericParserService } from './generic-parser.service';
  6. import { getID, ParseResult } from './parser-models';
  /**
   * Splits an XML edition into pages.
   *
   * Pages are delimited by page-break elements (`pb`) found inside the `front`
   * and `body` elements. When a section has no page breaks, the whole section
   * is treated as a single page.
   */
  @Injectable({
    providedIn: 'root',
  })
  export class StructureXmlParserService {
    constructor(
      private genericParserService: GenericParserService,
    ) {
    }

    /** Value of the `type` attribute that marks front matter as original content. */
    private frontOrigContentAttr = 'document_front';
    readonly frontTagName = 'front';
    readonly pageTagName = 'pb';
    readonly bodyTagName = 'body';

    /** CSS selector matching elements whose `type` attribute marks them as original front content. */
    private get frontOrigContentSelector(): string {
      // The value is quoted so the selector stays valid even if the marker is not a plain CSS identifier.
      return `[type="${this.frontOrigContentAttr}"]`;
    }

    /**
     * Builds the list of pages of the edition contained in `el`.
     *
     * - Page breaks carrying an `ed` attribute are ignored.
     * - If page breaks exist in both front and body, all page breaks are handled
     *   as one sequence (delimited by the last `text` element of the document).
     * - Otherwise front and body are handled separately; a section without page
     *   breaks becomes one single page (front only when it is marked as original content).
     *
     * @param el root element of the edition to analyse
     * @returns the edition structure; `pages` is empty when `el` is missing or has no child elements
     */
    parsePages(el: XMLElement): EditionStructure {
      // Without child elements there can be no front, body or page break, and no owner document to read.
      if (!el || !el.firstElementChild) { return { pages: [] }; }

      const front: XMLElement = el.querySelector(this.frontTagName);
      const body: XMLElement = el.querySelector(this.bodyTagName);
      const pbs = Array.from(el.querySelectorAll(this.pageTagName)).filter((p) => !p.getAttribute('ed'));
      const frontPbs = pbs.filter((p) => isNestedInElem(p, this.frontTagName));
      const bodyPbs = pbs.filter((p) => isNestedInElem(p, this.bodyTagName));
      const doc = el.firstElementChild.ownerDocument;

      if (frontPbs.length > 0 && bodyPbs.length > 0) {
        return {
          pages: pbs.map((pb: XMLElement, idx, arr: XMLElement[]) => this.parseDocumentPage(doc, pb, arr[idx + 1], 'text')),
        };
      }

      const frontPages = frontPbs.length === 0 && front && this.isMarkedAsOrigContent(front)
        ? [this.parseSinglePage(doc, front, 'page_front', this.frontTagName, 'facs_front')]
        : frontPbs.map((pb, idx, arr) => this.parseDocumentPage(doc, pb as HTMLElement, arr[idx + 1] as HTMLElement, this.frontTagName));

      let bodyPages: Page[] = [];
      if (bodyPbs.length > 0) {
        bodyPages = bodyPbs.map((pb, idx, arr) => this.parseDocumentPage(doc, pb as HTMLElement, arr[idx + 1] as HTMLElement, this.bodyTagName));
      } else if (body) {
        // No page break in the body: the whole body is one page. A missing body yields no page at all.
        bodyPages = [this.parseSinglePage(doc, body, 'page1', 'mainText', 'facs1')]; // TODO: translate mainText
      }

      return {
        pages: [...frontPages, ...bodyPages],
      };
    }

    /**
     * Builds the page that starts at the page break `pb`.
     *
     * If there is a next page break, the page content is made of the nodes between the
     * two page breaks; otherwise it is made of the nodes between `pb` and the last child
     * of the last `ancestorTagName` element of the document.
     *
     * @param doc document the page break belongs to
     * @param pb page break opening the page
     * @param nextPb page break opening the following page, if any
     * @param ancestorTagName tag name of the element that closes the last page
     * @returns the parsed page; its content is empty when no closing node can be determined
     */
    parseDocumentPage(doc: Document, pb: XMLElement, nextPb: XMLElement, ancestorTagName: string): Page {
      const nextNode = nextPb || this.getLastChildOfLastMatch(doc, ancestorTagName);
      const originalContent = nextNode
        ? getElementsBetweenTreeNode(pb, nextNode)
          .filter((n) => n.tagName !== this.pageTagName)
          .filter((c) => ![4, 7, 8].includes(c.nodeType)) // Filter CDATAs (4), processing instructions (7) and comments (8)
        : [];
      // Computed once so that `id`/`url` and `facs`/`facsUrl` are always derived from the same value.
      const id = getID(pb, 'page');
      const facs = (pb.getAttribute('facs') || 'page').split('#').slice(-1)[0];

      return {
        id,
        label: pb.getAttribute('n') || 'page',
        facs,
        originalContent,
        parsedContent: this.parsePageContent(doc, originalContent),
        url: this.getPageUrl(id),
        facsUrl: this.getPageUrl(facs),
      };
    }

    /**
     * Returns the last child of the last element of `doc` matching `tagName`,
     * or `null` when no element matches (querySelectorAll then returns an empty list).
     */
    private getLastChildOfLastMatch(doc: Document, tagName: string) {
      const matches = doc.querySelectorAll(tagName);

      return matches.length > 0 ? matches[matches.length - 1].lastChild : null;
    }

    /**
     * Builds a single page out of the whole content of `el`, using the given
     * identifier, label and facsimile name. An element without children yields empty content.
     */
    private parseSinglePage(doc: Document, el: XMLElement, id: string, label: string, facs: string): Page {
      const originalContent: XMLElement[] = el.firstChild ? getElementsBetweenTreeNode(el.firstChild, el.lastChild) : [];

      return {
        id,
        label,
        facs,
        originalContent,
        parsedContent: this.parsePageContent(doc, originalContent),
        url: this.getPageUrl(id),
        facsUrl: this.getPageUrl(facs),
      };
    }

    /**
     * Builds the image URL for a page: the part of `id` before the first `.`
     * is used as file name (with `.jpg` extension) inside the configured images folder.
     */
    private getPageUrl(id: string): string {
      // TODO: check if exists <graphic> element connected to page and return its url
      // TODO: handle multiple version of page
      const image = id.split('.')[0];

      return `${AppConfig.evtSettings.files.imagesFolderUrl}/${image}.jpg`;
    }

    /**
     * Parses the nodes of a page into generic elements.
     *
     * @param doc document the nodes originally belong to
     * @param pageContent nodes forming the page
     * @returns the flattened list of parsed elements (empty when `pageContent` is missing)
     */
    parsePageContent(doc: Document, pageContent: OriginalEncodingNodeType[]): Array<ParseResult<GenericElement>> {
      return (pageContent || [])
        .map((node) => this.parsePageNode(doc, node))
        .reduce((x, y) => x.concat(y), []);
    }

    /**
     * Parses one node of a page.
     *
     * - Front nodes (the front element itself or anything nested in it) are parsed only
     *   if they contain, or are marked as, original content; otherwise they are dropped.
     * - A `text` element containing a front element is expanded into its children.
     * - Any other node is handed to the generic parser.
     */
    private parsePageNode(doc: Document, node: OriginalEncodingNodeType): Array<ParseResult<GenericElement>> {
      const origEl = getEditionOrigNode(node, doc);
      // The original node lookup can come back empty; such a node contributes nothing to the page.
      if (!origEl) { return []; }

      if (origEl.nodeName === this.frontTagName || isNestedInElem(origEl, this.frontTagName)) {
        if (this.hasOriginalContent(origEl)) {
          return Array.from(origEl.querySelectorAll(this.frontOrigContentSelector))
            .map((c) => this.genericParserService.parse(c as XMLElement));
        }
        if (this.isMarkedAsOrigContent(origEl)) {
          return [this.genericParserService.parse(origEl)];
        }

        return [] as Array<ParseResult<GenericElement>>;
      }

      if (origEl.tagName === 'text' && origEl.querySelectorAll && origEl.querySelectorAll(this.frontTagName).length > 0) {
        return this.parsePageContent(doc, Array.from(origEl.children) as HTMLElement[]);
      }

      return [this.genericParserService.parse(origEl)];
    }

    /**
     * Tells whether `el` has at least one descendant marked as original front content.
     * Nodes that cannot be queried (e.g. text or comment nodes) never have such content.
     */
    hasOriginalContent(el: XMLElement): boolean {
      return !!el
        && typeof el.querySelectorAll === 'function'
        && el.querySelectorAll(this.frontOrigContentSelector).length > 0;
    }

    /**
     * Tells whether `el` is marked as original front content, either directly through its
     * `type` attribute, through one of its descendants, or through `isNestedInElem`
     * (presumably an ancestor carrying the same `type` attribute - verify against dom-utils).
     * Text nodes and nodes without attributes are never marked.
     */
    isMarkedAsOrigContent(el: XMLElement): boolean {
      if (!el || el.nodeType === 3 || typeof el.getAttribute !== 'function') {
        return false;
      }

      return el.getAttribute('type') === this.frontOrigContentAttr
        || this.hasOriginalContent(el)
        || isNestedInElem(el, '', [{ key: 'type', value: this.frontOrigContentAttr }]);
    }
  }
  /**
   * Resolves the node of `doc` that `el` points to through its `xpath` attribute.
   *
   * When the document element has a namespace, every step of the path is prefixed
   * with `ns:` so that it can be resolved by the namespace resolver.
   *
   * @param el node possibly carrying an `xpath` attribute
   * @param doc document in which the path is evaluated
   * @returns the first node matched by the path; `el` itself when it has no `xpath`
   * attribute (or no attributes at all) or when the path matches nothing
   */
  function getEditionOrigNode(el: XMLElement, doc: Document) {
    // Text/comment nodes have no getAttribute; they are returned untouched.
    const xpath = el && el.getAttribute ? el.getAttribute('xpath') : undefined;
    if (!xpath) {
      return el;
    }

    const path = doc.documentElement.namespaceURI ? xpath.replace(/\//g, '/ns:') : xpath;
    const xpathRes = doc.evaluate(path, doc, createNsResolver(doc), XPathResult.ANY_TYPE, undefined);

    // iterateNext() yields null when nothing matches: fall back to `el` so callers never dereference null.
    return (xpathRes.iterateNext() as XMLElement) || el;
  }
  117. }