HTML code parsing and truncation

We often run into situations where we need to parse HTML response text and truncate it. How to truncate depends on the situation, and the requirements differ from case to case. Whatever the rule, when HTML text is truncated its markup tags must still be properly closed; otherwise the UI will be distorted. Below is sample code that parses a given HTML text and truncates it to a provided limit. When calculating the length of the truncated text, the length of the HTML tags is not counted, because tags are rendering instructions and take up no space on the rendered page. For example, suppose you have the following HTML text:

<html>
<body>
<span>This is sample text, and I want to <b>truncate</b> it, can you please help me!</span>
</body>
</html>

and you want to truncate it so that the text displayed on the HTML page is about 50 characters long, without counting the length of the HTML tags toward that limit. In this situation the code below will help.

import java.util.Iterator;

import java.util.Stack;

import java.util.StringTokenizer;

import java.util.regex.Matcher;

import java.util.regex.Pattern;

import com.localmatters.util.StringUtils;

/**

* A class which will format the given HTML string by preserving the order of start and end HTML tags

public class HtmlTextTruncator {

/**

* This method will return a substring of HTML text based on provided limit by preserving the order of HTML tags.

* Length of HTML tags will not be considered while calculating the length of return string.

@SuppressWarnings("unchecked")

public static String htmlSubString(String inputString, int limit) {

int actualTextLength = 0; // Text length without considering HTML tags

boolean isNewTag = false;

String htmlTagPattern = "<[^<^>]*>"; // Any HTML tag (start or end)

Pattern htmlStartTagPattern = Pattern.compile("<[^/^<^>]*>"); // only start tag

Pattern htmlEndTagPattern = Pattern.compile("]*>"); // only end tag

Stack tags = new Stack(); // Stack varibale used for pushing and poping up the HTML tags

StringBuilder message = new StringBuilder();

if(StringUtils.isNotEmpty(inputString) && limit > 0) {

// Create the regular expression based tokenizer

Iterator htmlTokenizer = new RETokenizer(inputString, htmlTagPattern, true);

// Get the tokens (and delimiters)

while(htmlTokenizer.hasNext()) {

String tokenOrDelim = (String)htmlTokenizer.next();

if(htmlStartTagPattern.matcher(tokenOrDelim).matches()) {

if (actualTextLength <> tags.push(tokenOrDelim); // add tag to stack

message.append(tokenOrDelim);

} else {

isNewTag = true;

}

} else if(htmlEndTagPattern.matcher(tokenOrDelim).matches()) {

if (!isNewTag) {

tags.pop(); // remove tag from stack

message.append(tokenOrDelim);

} else {

isNewTag = false;

}

} else if (actualTextLength <>

StringTokenizer textTockens = new StringTokenizer(tokenOrDelim, " ", true);

while(textTockens.hasMoreElements()) {

String word = textTockens.nextToken();

if(limit - actualTextLength > 0) {

message.append(word);

actualTextLength+=word.length();

} else {

break;

}

} else {

message.append(inputString);

}

return message.toString();

}

/**
 * Iterator that splits a character sequence on a regular expression and returns the matches of
 * that expression, optionally interleaved with the text found between them.
 *
 * <p>When {@code returnDelims} is {@code true}, the text preceding each match is returned before
 * the match itself (as an empty string when two matches are adjacent or the input starts with a
 * match), and any text after the last match is returned as the final element.
 */
class RETokenizer implements Iterator<String> {

    private final CharSequence input;
    private final boolean returnDelims;

    /** Set to {@code null} once the input has been scanned to the end. */
    private Matcher matcher;

    /** Pending in-between text, returned before {@link #match}. */
    private String delim;

    /** Pending pattern match. */
    private String match;

    /** End offset of the most recent match. */
    private int lastEnd = 0;

    /**
     * Creates a tokenizer over {@code input}.
     *
     * @param input the text to scan
     * @param patternStr regular expression whose matches are the tokens
     * @param returnDelims whether the text between matches is returned as well
     */
    public RETokenizer(CharSequence input, String patternStr, boolean returnDelims) {
        this.input = input;
        this.returnDelims = returnDelims;
        this.matcher = Pattern.compile(patternStr).matcher(input);
    }

    /**
     * Returns {@code true} if there are more tokens or delimiters. Calling it repeatedly without
     * {@link #next()} in between does not consume anything.
     */
    @Override
    public boolean hasNext() {
        // Pending elements are checked first so that a repeated call stays true until next().
        if (delim != null || match != null) {
            return true;
        }
        if (matcher == null) {
            return false;
        }
        if (matcher.find()) {
            if (returnDelims) {
                delim = input.subSequence(lastEnd, matcher.start()).toString();
            }
            match = matcher.group();
            lastEnd = matcher.end();
            return true;
        }
        // After a failed find() the matcher would start over from the beginning on the next
        // call, so it is dropped as soon as the end of the input has been reached.
        matcher = null;
        if (returnDelims && lastEnd < input.length()) {
            delim = input.subSequence(lastEnd, input.length()).toString();
            lastEnd = input.length();
            return true;
        }
        return false;
    }

    /**
     * Returns the next token (or delimiter if {@code returnDelims} is true).
     *
     * @throws java.util.NoSuchElementException if nothing is left
     */
    @Override
    public String next() {
        if (!hasNext()) {
            throw new java.util.NoSuchElementException();
        }
        String result;
        if (delim != null) {
            result = delim;
            delim = null;
        } else {
            result = match;
            match = null;
        }
        return result;
    }

    /**
     * Returns {@code true} when the element that {@link #next()} would return is a pattern match
     * rather than in-between text. Reflects only the elements already looked up by
     * {@link #hasNext()}.
     */
    public boolean isNextToken() {
        return delim == null && match != null;
    }

    /** Not supported. */
    @Override
    public void remove() {
        throw new UnsupportedOperationException();
    }
}

HTML code parsing and truncation

Post a Comment

Streamline Your AI Development: The Power of an LLM Factory

Create an AEM (CQ) project using Maven

Categories

Main Tags

Gen AI

Popular Posts

Create an AEM (CQ) project using Maven

AEM as a Cloud Service (AEMaaCS) – Architecture Overview

Building a Gemini-Powered Browser Agent with LangChain and the Playwright MCP Server

Contact Form