import json

import pandas as pd

# Formatting legend
#
# Formatting is encoded as (currently) five bits of information, packed
# together into a single int:
#    0           = no formatting
#    1  (l.s.b.) = grassetto / bold
#    2           = corsivo / italic
#    4           = sottolineato / underline
#    8           = barrato / strikethrough
#    16 (m.s.b.) = evidenziato / highlight
# Codes are ADDITIVE: for instance 28 = 16 + 8 + 4 stands for underlined,
# struck-through and highlighted text.
_FORMAT_FLAGS = (
    ('grassetto', 1),
    ('corsivo', 2),
    ('sottolineato', 4),
    ('barrato', 8),
    ('evidenziato', 16),
)
_HIGHLIGHT_FLAG = 16
_NUM_FORMAT_BITS = 5


# Text highlighting
def formatAllContexts(bibliocontexts: pd.DataFrame):
    """Highlight every matched element and build the formatted contexts.

    For each column whose name starts with ``'pitxt'`` the highlight flag is
    added to the per-character codes stored in ``'formattazione contesto'``.
    The first such column is paired with ``'pitxt'``/``'elemlen'``, the
    following ones with ``'pitxt_<n>'``/``'elemlen_<n>'`` (n = 1, 2, ...).
    The highlight offset is ``pitxt - piniz`` and its length is ``elemlen``.
    Finally ``'contesto formattato'`` is filled with the output of
    :func:`formatContext` for every row.

    The DataFrame is modified in place and also returned.
    """
    index = 0
    for col in bibliocontexts.columns:
        if not col.startswith('pitxt'):
            continue
        suffix = '' if index == 0 else '_' + str(index)
        position_col = 'pitxt' + suffix
        length_col = 'elemlen' + suffix
        # Row-wise iteration with zip instead of DataFrame.apply(axis=1):
        # apply() hands back a DataFrame (not a Series) on an empty frame,
        # which cannot be assigned to a single column.
        bibliocontexts['formattazione contesto'] = [
            addHighlightToFormatting(codes, int(position - origin), int(length))
            for codes, position, origin, length in zip(
                bibliocontexts['formattazione contesto'],
                bibliocontexts[position_col],
                bibliocontexts['piniz'],
                bibliocontexts[length_col],
            )
        ]
        index += 1
    bibliocontexts['contesto formattato'] = [
        formatContext(context, codes)
        for context, codes in zip(
            bibliocontexts['contesto'],
            bibliocontexts['formattazione contesto'],
        )
    ]
    return bibliocontexts


# Define formatting information for a single context
def formatContext(context, formatCodesJson, strict=True):
    """Split a context into runs of identically formatted text.

    Args:
        context: the non-formatted string.
        formatCodesJson: JSON array holding one numeric format code per
            character of ``context``.
        strict: when true, a length mismatch between the codes and the
            context raises ``ValueError``.

    Returns:
        A JSON string encoding a list of
        ``{'formatting': [...], 'stringPart': '...'}`` dicts, one per run.

    Raises:
        ValueError: if ``strict`` is set and the lengths differ.
    """
    formatCodes = json.loads(formatCodesJson)
    if strict and len(formatCodes) != len(context):
        raise ValueError("Formatting information does not match context")
    if not formatCodes:
        # No codes at all: nothing to segment. In non-strict mode any text
        # present is returned as a single unformatted part.
        parts = [{'formatting': [], 'stringPart': context}] if context else []
        return json.dumps(parts)

    # Positions where the formatting code CHANGES, plus the first code.
    formatChanges = [(0, formatCodes[0])] + [
        (position, code)
        for position, code in enumerate(formatCodes)
        if position > 0 and code != formatCodes[position - 1]
    ]
    # Each run ends where the next one starts; the last run extends to the
    # end of the context (end=None). Slicing up to -1 here would silently
    # drop the final character of every context.
    ends = [position for position, _ in formatChanges[1:]] + [None]

    formattedContext = []
    for (start, code), end in zip(formatChanges, ends):
        formatting = bitToFormat(getBits(code)) if code > 0 else []
        formattedContext.append(
            {'formatting': formatting, 'stringPart': context[start:end]}
        )
    return json.dumps(formattedContext)


# For the sake of completeness, get an array of formatting codes back out of a formatted context
def getFormattingCodes(formattedContext):
    """Return one numeric code per character of an already-parsed formatted context.

    ``formattedContext`` is the decoded list of parts (dicts with the keys
    ``'formatting'`` and ``'stringPart'``), not the JSON string.
    """
    codes = []
    for contextPart in formattedContext:
        code = getSingleFormattingCode(contextPart['formatting'])
        codes.extend([code] * len(contextPart['stringPart']))
    return codes


def getSingleFormattingCode(formatting):
    """Encode a collection of format names as the corresponding additive int code."""
    return sum(flag for name, flag in _FORMAT_FLAGS if name in formatting)


# Utility: modify a single format codes string to add highlighting
def addHighlightToFormatting(formatCodesJson, highlightStart, highlightLength):
    """Set the highlight flag on a range of characters.

    Args:
        formatCodesJson: JSON array of per-character int format codes.
        highlightStart: index of the first character to highlight.
        highlightLength: number of characters to highlight.

    Returns:
        The updated codes, serialized back to JSON.

    Positions falling outside the array are ignored: a negative start is
    clamped to 0 (instead of wrapping around to the end of the list) and the
    range is truncated at the last code.
    """
    formatCodes = json.loads(formatCodesJson)
    first = max(highlightStart, 0)
    last = min(highlightStart + highlightLength, len(formatCodes))
    for position in range(first, last):
        # OR rather than add: adding 16 to an already highlighted character
        # would overflow into a sixth bit and lose the highlight on decoding.
        formatCodes[position] |= _HIGHLIGHT_FLAG
    return json.dumps(formatCodes)


# Formatting decoders -- from numeric code to explicit formatting
def getBits(num):
    """Return ``num`` as a binary string, left-padded with zeros to five digits."""
    binary = bin(num)  # bin() yields a bit string with a "0b" prefix
    return binary[2:].rjust(_NUM_FORMAT_BITS, "0")


def bitToFormat(bitString):
    """Translate a bit string (least significant bit last) into format names.

    Only the five rightmost bits are inspected; shorter strings are treated
    as if they were left-padded with zeros.
    """
    padded = bitString.rjust(_NUM_FORMAT_BITS, "0")
    return [
        name
        for offset, (name, _) in enumerate(_FORMAT_FLAGS, start=1)
        if padded[-offset] == "1"
    ]