format.py 4.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135
  1. import json
  2. import polars as pl
  3. # Text highlighting
  4. def formatAllContexts(bibliocontexts: pl.DataFrame) -> pl.DataFrame:
  5. index = 0
  6. for col in bibliocontexts.columns:
  7. if col.startswith('pitxt'):
  8. if index == 0:
  9. bibliocontexts = bibliocontexts.with_column(
  10. pl.struct(['formattazione contesto', 'pitxt', 'piniz', 'elemlen']).apply(
  11. lambda x: addHighlightToFormatting(x['formattazione contesto'],
  12. int(x['pitxt'] - x['piniz']),
  13. int(x['elemlen']))
  14. ).alias('formattazione contesto')
  15. )
  16. else:
  17. bibliocontexts = bibliocontexts.with_column(
  18. pl.struct([f'formattazione contesto', f'pitxt_{index}', 'piniz',
  19. f'elemlen_{index}']).apply(
  20. lambda x: addHighlightToFormatting(x['formattazione contesto'],
  21. int(x[f'pitxt_{index}'] - x['piniz']),
  22. int(x[f'elemlen_{index}']))
  23. ).alias('formattazione contesto')
  24. )
  25. index += 1
  26. bibliocontexts = bibliocontexts.with_column(
  27. pl.struct(['contesto', 'formattazione contesto']).apply(
  28. lambda x: formatContext(x['contesto'], x['formattazione contesto'])
  29. ).alias('contesto formattato')
  30. )
  31. return bibliocontexts
  32. # Define formatting information for a single context
  33. # Takes as input a non-formatted string + character-by-character format information
  34. def formatContext(context, formatCodesJson, strict=True):
  35. formatCodes = json.loads(formatCodesJson)
  36. if strict and len(formatCodes)!=len(context):
  37. raise ValueError("Formatting information does not match context")
  38. # Get formatting code CHANGES + first code
  39. formatChanges = [(0, formatCodes[0])] + [(ind, formatCodes[ind]) for ind in range(1, len(formatCodes)) if formatCodes[ind]!=formatCodes[ind-1]]
  40. # Define a dict containing string parts + format information for each part
  41. formattedContext = []
  42. for ind, form in enumerate(formatChanges):
  43. format0 = []
  44. if form[1]>0:
  45. format0 = bitToFormat(getBits(form[1]))
  46. start = form[0]
  47. end = formatChanges[ind+1][0] if ind < len(formatChanges)-1 else -1
  48. try:
  49. stringPart = context[start:end]
  50. formattedContext.append( {'formatting': format0, 'stringPart': stringPart} )
  51. except:
  52. pass
  53. return json.dumps(formattedContext)
  54. # For the sake of completeness, get an array of formatting codes back out of a formatted context
  55. def getFormattingCodes(formattedContext):
  56. toRet = []
  57. for contextPart in formattedContext:
  58. stringPart = contextPart['stringPart']
  59. code = getSingleFormattingCode(contextPart['formatting'])
  60. toRet = toRet + [code]*len(stringPart)
  61. return toRet
  62. #
  63. def getSingleFormattingCode(formatting):
  64. toRet = 0
  65. if 'grassetto' in formatting:
  66. toRet += 1
  67. if 'corsivo' in formatting:
  68. toRet += 2
  69. if 'sottolineato' in formatting:
  70. toRet += 4
  71. if 'barrato' in formatting:
  72. toRet += 8
  73. if 'evidenziato' in formatting:
  74. toRet += 16
  75. return toRet
  76. # Utility: modify a single format codes string to add highlighting
  77. def addHighlightToFormatting(formatCodesJson, highlightStart, highlightLength):
  78. formatCodes = json.loads(formatCodesJson)
  79. for index in range(highlightLength):
  80. try:
  81. formatCodes[highlightStart + index] += 16
  82. except IndexError:
  83. pass
  84. return json.dumps(formatCodes)
  85. # Formatting decoders -- from numeric code to explicit formatting
  86. # Legend:
  87. '''
  88. Formatting is set by (currently) five bits of information, codified together
  89. as an int:
  90. 0 (l.s.b.) = no formatting
  91. 1 = grassetto / bold
  92. 2 = corsivo / italic
  93. 4 = sottolineato / underline
  94. 8 = barrato / strikethrough
  95. 16 (m.s.b.) = highlight
  96. Clearly, codes are ADDITIVE: for instance 28 = 16 + 8 + 4 stands for
  97. underlined, struck-through and highlighted text
  98. '''
  99. def getBits(num):
  100. numOfBits = 5
  101. aa = bin(num) # Bin converts the input to a bit string (with prefix \b)
  102. return aa[2:].rjust(numOfBits, "0")
  103. def bitToFormat(bitString):
  104. format0 = []
  105. if bitString[-1]=="1":
  106. format0.append('grassetto')
  107. if bitString[-2]=="1":
  108. format0.append('corsivo')
  109. if bitString[-3]=="1":
  110. format0.append('sottolineato')
  111. if bitString[-4]=="1":
  112. format0.append('barrato')
  113. if bitString[-5]=="1":
  114. format0.append('evidenziato')
  115. return format0