123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135 |
- import json
- import polars as pl
- # Text highlighting
def formatAllContexts(bibliocontexts: pl.DataFrame) -> pl.DataFrame:
    """Apply highlighting to every context row and build the formatted context.

    For each column whose name starts with ``'pitxt'`` the per-character
    format codes stored in ``'formattazione contesto'`` are updated through
    ``addHighlightToFormatting``; the highlight starts at ``pitxt - piniz``
    and spans ``elemlen`` characters. The first matching column is read from
    ``'pitxt'``/``'elemlen'``, the n-th following one from
    ``'pitxt_<n>'``/``'elemlen_<n>'``.

    Finally ``formatContext`` is run on ``'contesto'`` and the updated codes,
    and its result is stored in the ``'contesto formattato'`` column.

    Args:
        bibliocontexts: Frame holding the columns named above.

    Returns:
        The frame with ``'formattazione contesto'`` updated and
        ``'contesto formattato'`` added.
    """
    formatting_col = 'formattazione contesto'
    match_count = 0
    for column_name in bibliocontexts.columns:
        if not column_name.startswith('pitxt'):
            continue

        # The first match uses the un-suffixed column names; later matches
        # use the numeric suffix equal to the number of matches seen so far.
        if match_count == 0:
            start_col, length_col = 'pitxt', 'elemlen'
        else:
            start_col, length_col = f'pitxt_{match_count}', f'elemlen_{match_count}'

        highlighted = pl.struct([formatting_col, start_col, 'piniz', length_col]).apply(
            lambda row: addHighlightToFormatting(
                row[formatting_col],
                int(row[start_col] - row['piniz']),
                int(row[length_col]),
            )
        )
        bibliocontexts = bibliocontexts.with_column(highlighted.alias(formatting_col))
        match_count += 1

    rendered = pl.struct(['contesto', formatting_col]).apply(
        lambda row: formatContext(row['contesto'], row[formatting_col])
    )
    return bibliocontexts.with_column(rendered.alias('contesto formattato'))
- # Define formatting information for a single context
- # Takes as input a non-formatted string + character-by-character format information
def formatContext(context, formatCodesJson, strict=True):
    """Split a context into runs of identical formatting and serialise them.

    Args:
        context: The plain (unformatted) context string.
        formatCodesJson: JSON array with one integer formatting code per
            character of ``context``.
        strict: When true, the number of codes must equal ``len(context)``.

    Returns:
        A JSON string encoding a list of
        ``{'formatting': [...], 'stringPart': ...}`` objects, one per run of
        consecutive characters that share the same formatting code. An empty
        code list yields ``"[]"``.

    Raises:
        ValueError: If ``strict`` is true and the two lengths differ.
    """
    formatCodes = json.loads(formatCodesJson)
    if strict and len(formatCodes) != len(context):
        raise ValueError("Formatting information does not match context")

    # Nothing to describe; indexing formatCodes[0] below would fail.
    if not formatCodes:
        return json.dumps([])

    # Positions at which the formatting code changes, always including 0.
    runStarts = [0] + [
        ind for ind in range(1, len(formatCodes))
        if formatCodes[ind] != formatCodes[ind - 1]
    ]
    # Every run stops where the next one starts. The final run must use None
    # (open-ended slice): an end of -1 would drop the last character.
    runEnds = runStarts[1:] + [None]

    formattedContext = []
    for start, end in zip(runStarts, runEnds):
        code = formatCodes[start]
        formatting = bitToFormat(getBits(code)) if code > 0 else []
        try:
            stringPart = context[start:end]
        except TypeError:
            # Only reachable with strict=False and a context that cannot be
            # sliced (e.g. None): keep the best-effort behaviour and skip it.
            continue
        formattedContext.append({'formatting': formatting, 'stringPart': stringPart})
    return json.dumps(formattedContext)
- # For the sake of completeness, get an array of formatting codes back out of a formatted context
def getFormattingCodes(formattedContext):
    """Expand a formatted context back into one formatting code per character.

    Args:
        formattedContext: Iterable of mappings with a ``'stringPart'`` and a
            ``'formatting'`` entry (the parsed form of what ``formatContext``
            serialises).

    Returns:
        A flat list in which each part's numeric code is repeated once for
        every character of its ``'stringPart'``.
    """
    codes = []
    for part in formattedContext:
        text = part['stringPart']
        code = getSingleFormattingCode(part['formatting'])
        # One copy of the code for each character covered by this part.
        codes.extend([code] * len(text))
    return codes
- #
def getSingleFormattingCode(formatting):
    """Encode a collection of formatting names as a single integer code.

    Args:
        formatting: Container tested with ``in`` for each known formatting
            name.

    Returns:
        The sum of the weights of the names found: grassetto=1, corsivo=2,
        sottolineato=4, barrato=8, evidenziato=16. Returns 0 when none match.
    """
    weights = (
        ('grassetto', 1),
        ('corsivo', 2),
        ('sottolineato', 4),
        ('barrato', 8),
        ('evidenziato', 16),
    )
    return sum(weight for name, weight in weights if name in formatting)
- # Utility: modify a single format codes string to add highlighting
def addHighlightToFormatting(formatCodesJson, highlightStart, highlightLength):
    """Set the highlight bit on a range of per-character formatting codes.

    Args:
        formatCodesJson: JSON array of integer formatting codes.
        highlightStart: Index of the first character to highlight.
        highlightLength: Number of characters to highlight.

    Returns:
        A JSON array string with the highlight bit (16) set on every code in
        the requested range. Positions outside the list, on either side, are
        ignored.
    """
    formatCodes = json.loads(formatCodesJson)
    highlightBit = 16

    # Clamp to the valid index range: positions past the end are ignored, and
    # negative positions must not wrap around to the end of the list.
    first = max(highlightStart, 0)
    stop = min(highlightStart + highlightLength, len(formatCodes))
    for position in range(first, stop):
        # OR rather than add, so highlighting an already highlighted character
        # (overlapping or repeated highlights) does not corrupt the code.
        formatCodes[position] |= highlightBit
    return json.dumps(formatCodes)
- # Formatting decoders -- from numeric code to explicit formatting
- # Legend:
- '''
- Formatting is set by (currently) five bits of information, codified together
- as an int:
- 0 = no formatting
- 1 (l.s.b.) = grassetto / bold
- 2 = corsivo / italic
- 4 = sottolineato / underline
- 8 = barrato / strikethrough
- 16 (m.s.b.) = evidenziato / highlight
- Clearly, codes are ADDITIVE: for instance 28 = 16 + 8 + 4 stands for
- underlined, struck-through and highlighted text
- '''
def getBits(num):
    """Return the binary digits of ``num`` left-padded with zeros to 5 places.

    Args:
        num: Integer formatting code.

    Returns:
        The output of ``bin(num)`` without its first two characters (the
        ``0b`` prefix for non-negative input), zero-padded on the left to at
        least five characters. Longer bit strings are returned unpadded.
    """
    bit_width = 5
    digits = bin(num)[2:]  # bin() yields e.g. '0b101'; drop the '0b' prefix
    return digits.zfill(bit_width)
def bitToFormat(bitString):
    """Decode a bit string into the list of formatting names it enables.

    Args:
        bitString: String of ``'0'``/``'1'`` characters, at least five long,
            read from its right-hand end.

    Returns:
        The names whose bit is ``'1'``, in the order grassetto, corsivo,
        sottolineato, barrato, evidenziato (last character first).

    Raises:
        IndexError: If ``bitString`` has fewer than five characters.
    """
    names_from_lsb = ('grassetto', 'corsivo', 'sottolineato', 'barrato', 'evidenziato')
    return [
        name
        for offset, name in enumerate(names_from_lsb, start=1)
        if bitString[-offset] == "1"
    ]
|