HTML code parsing and truncation
We often run into situations where we need to parse HTML response text and truncate it. The truncation requirements differ from case to case, but one thing is always true: when HTML text is truncated, the markup/HTML tags must still be properly closed, otherwise the UI will be distorted. Below is sample code that parses a given HTML text and truncates it to a provided limit. When calculating the length of the truncated text, the length of the markup/HTML tags is not counted, because tags take up no space on the rendered page (they only control rendering). For example, suppose you have the following HTML text:
<html>
<body>
<span>This is sample text, and I want to <b>truncate</b> it, can you please help me!
</body>
</html>
import java.util.StringTokenizer; import java.util.regex.Matcher;
<html>
<body>
<span>This is sample text, and I want to <b>truncate</b> it, can you please help me!
</body>
</html>
and you want to truncate it so that the text displayed on the HTML page is limited to 50 characters, without counting the length of the HTML tags toward that limit. In this situation the code below will help you.
import java.util.Iterator;
import java.util.Stack;
import java.util.regex.Pattern;
import com.localmatters.util.StringUtils;
/**
 * Truncates HTML text to a limit on its visible length while keeping the
 * markup balanced: every tag that was copied to the output before the limit
 * was reached still gets its end tag from the input.
 */
public class HtmlTextTruncator {

    /** Any HTML tag (start, end or self-closing). */
    private static final Pattern HTML_TAG = Pattern.compile("<[^<>]*>");

    /**
     * Tags that never have a separate end tag: declarations/comments
     * ({@code <!...>}, {@code <?...>}), self-closing tags ({@code <.../>})
     * and the HTML void elements.
     */
    private static final Pattern STANDALONE_TAG = Pattern.compile(
            "<[!?][^<>]*>"
            + "|<[^<>]*/>"
            + "|<(?:area|base|br|col|embed|hr|img|input|link|meta|param|source|track|wbr)"
            + "(?=[\\s/>])[^<>]*>",
            Pattern.CASE_INSENSITIVE);

    /**
     * Returns a prefix of the given HTML whose visible text is cut off at
     * {@code limit} characters, preserving the order and pairing of HTML tags.
     *
     * <p>The length of the tags themselves is not counted. Every character
     * outside a tag is counted, including line breaks between tags. Text is
     * cut at word granularity: space-separated words (and the single spaces
     * between them) are copied whole for as long as the running count is below
     * {@code limit}, so the visible text can exceed the limit by at most the
     * length of the last copied word minus one.
     *
     * <p>Once the limit is reached, elements that open afterwards are dropped
     * together with their content and their end tags, while end tags that
     * belong to elements opened earlier are still copied. This relies on every
     * non-void element that opens after the limit being explicitly closed in
     * the input.
     *
     * @param inputString the HTML text to truncate; may be {@code null}
     * @param limit maximum number of visible characters; when it is zero or
     *        negative the input is returned unchanged
     * @return the truncated HTML; an empty string when {@code inputString} is
     *         {@code null}
     */
    public static String htmlSubString(String inputString, int limit) {
        if (inputString == null) {
            // StringBuilder.append(null) would otherwise produce the literal text "null".
            return "";
        }
        // Explicit emptiness check; stands in for the project's StringUtils.isNotEmpty
        // (assumed to mean "non-null and non-empty" -- TODO confirm).
        if (inputString.isEmpty() || limit <= 0) {
            return inputString;
        }

        StringBuilder message = new StringBuilder();
        int actualTextLength = 0; // visible text length, tags excluded
        int skippedDepth = 0;     // nesting depth of elements opened after the limit
        int lastEnd = 0;          // end offset of the previous tag

        Matcher tagMatcher = HTML_TAG.matcher(inputString);
        while (tagMatcher.find()) {
            actualTextLength = appendText(
                    message, inputString.substring(lastEnd, tagMatcher.start()), actualTextLength, limit);
            lastEnd = tagMatcher.end();

            String tag = tagMatcher.group();
            if (tag.startsWith("</")) {
                if (skippedDepth > 0) {
                    skippedDepth--;      // closes an element that was never emitted
                } else {
                    message.append(tag); // closes an element that was emitted
                }
            } else if (actualTextLength < limit) {
                message.append(tag);
            } else if (!STANDALONE_TAG.matcher(tag).matches()) {
                // A counter (not a flag) is required so that nested skipped
                // elements consume exactly their own end tags.
                skippedDepth++;
            }
        }
        appendText(message, inputString.substring(lastEnd), actualTextLength, limit);
        return message.toString();
    }

    /**
     * Appends words and single spaces from {@code text} while the visible
     * length is below the limit.
     *
     * @return the updated visible length
     */
    private static int appendText(StringBuilder out, String text, int usedLength, int limit) {
        StringTokenizer words = new StringTokenizer(text, " ", true);
        while (usedLength < limit && words.hasMoreTokens()) {
            String word = words.nextToken();
            out.append(word);
            usedLength += word.length();
        }
        return usedLength;
    }
}
/**
 * Iterator that walks a character sequence and returns, in input order, the
 * substrings matched by a regular expression (the "tokens") and, optionally,
 * the text found between those matches (the "delimiters").
 *
 * <p>When delimiters are requested, the text before each match is returned
 * ahead of the match itself, even when that text is empty (for example when
 * the input starts with a match or two matches are adjacent). Text after the
 * last match is returned only when it is non-empty.
 */
class RETokenizer implements Iterator<String> {
    private final CharSequence input;
    private final boolean returnDelims;
    // Set to null once the input is exhausted so find() is never called again.
    private Matcher matcher;
    // Buffered text preceding the current match; returned before 'match'.
    private String delim;
    // Buffered text of the current match.
    private String match;
    // End offset of the most recent successful match.
    private int lastEnd = 0;

    /**
     * @param input the text to tokenize
     * @param patternStr regular expression whose matches are the tokens
     * @param returnDelims whether the text between matches is returned as well
     */
    public RETokenizer(CharSequence input, String patternStr, boolean returnDelims) {
        this.input = input;
        this.returnDelims = returnDelims;
        this.matcher = Pattern.compile(patternStr).matcher(input);
    }

    /**
     * Returns true if there are more tokens or delimiters. Calling it
     * repeatedly without calling {@link #next()} does not consume anything.
     */
    @Override
    public boolean hasNext() {
        // Buffered values are checked first: the trailing delimiter is
        // buffered at the same moment the matcher is discarded.
        if (delim != null || match != null) {
            return true;
        }
        if (matcher == null) {
            return false;
        }
        if (matcher.find()) {
            if (returnDelims) {
                delim = input.subSequence(lastEnd, matcher.start()).toString();
            }
            match = matcher.group();
            lastEnd = matcher.end();
            return true;
        }
        // Matcher.find() is documented to restart from the beginning after a
        // failed search, so the matcher is dropped on every exhaustion path.
        matcher = null;
        if (returnDelims && lastEnd < input.length()) {
            delim = input.subSequence(lastEnd, input.length()).toString();
            lastEnd = input.length();
            return true;
        }
        return false;
    }

    /**
     * Returns the next token (or delimiter if returnDelims is true).
     *
     * @throws java.util.NoSuchElementException if the input is exhausted
     */
    @Override
    public String next() {
        // Advance here as well so next() works without a preceding hasNext().
        if (!hasNext()) {
            throw new java.util.NoSuchElementException("no more tokens");
        }
        String result;
        if (delim != null) {
            result = delim;
            delim = null;
        } else {
            result = match;
            match = null;
        }
        return result;
    }

    /**
     * Returns true if the value currently buffered for {@link #next()} is a
     * match rather than a delimiter. Only the buffered state is inspected, so
     * this is meaningful after {@link #hasNext()} has been called.
     */
    public boolean isNextToken() {
        return delim == null && match != null;
    }

    /** Removal is not supported. */
    @Override
    public void remove() {
        throw new UnsupportedOperationException();
    }
}
Comments