HTML code parsing and truncation

We often run into situations where we need to parse an HTML response text and truncate it. The exact requirements differ from case to case, but one thing always holds: when HTML text is truncated, its markup/HTML tags must still be properly closed, otherwise the UI will be distorted. Below is sample code that parses a given HTML text and truncates it based on the provided limit. The length of markup/HTML tags is not counted when calculating the length of the truncated text, because tags are rendering instructions and take up no space on the UI/web page. For example, suppose you have the following HTML text:

<html>
<body>
<span>This is sample text, and I want to <b>truncate</b> it, can you please help me!</span>
</body>
</html>

and you want to truncate it so that the text displayed on the HTML page is 50 characters long, without the length of the HTML tags being counted toward that limit. In this situation the code below will help you.


import java.util.Iterator;
import java.util.Stack;
import java.util.StringTokenizer;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import com.localmatters.util.StringUtils;
/**
 * A class which will format the given HTML string by preserving the order of start and end HTML tags.
 */
public class HtmlTextTruncator {

      /** Matches one tag of any kind: everything from '<' up to the next '>'. */
      private static final Pattern HTML_TAG = Pattern.compile("<[^<>]*>");

      /**
       * Matches tags that are never followed by a closing tag of their own:
       * declarations/comments/processing instructions ({@code <!...>}, {@code <?...>}),
       * XML-style self-closing tags ({@code <.../>}) and the HTML void elements.
       */
      private static final Pattern STANDALONE_TAG = Pattern.compile(
                  "<(?:[!?][^<>]*"
                  + "|[^<>]*/"
                  + "|(?:area|base|br|col|embed|hr|img|input|link|meta|param|source|track|wbr)(?:\\s[^<>]*)?"
                  + ")>",
                  Pattern.CASE_INSENSITIVE);

      /**
       * Returns a truncated copy of the given HTML text in which every start tag that was
       * kept still receives its end tag from the input.
       *
       * Only the text between tags counts toward {@code limit}; tags are copied without being
       * counted. Text is copied word by word (words are separated by single spaces, and each
       * space counts as one character). A word is never split, so the word that reaches the
       * limit is kept whole and the visible text may exceed {@code limit} by part of that word.
       *
       * Once the limit is reached, all further text is dropped, start tags are dropped together
       * with their matching end tags, and standalone tags (void elements, self-closing tags,
       * comments, doctype) are dropped. End tags belonging to start tags that were already
       * copied are still copied.
       *
       * @param inputString the HTML text to truncate; may be {@code null}
       * @param limit       maximum number of text characters to keep
       * @return the truncated HTML; the unchanged input when it is empty or when
       *         {@code limit <= 0}; an empty string when {@code inputString} is {@code null}
       */
      public static String htmlSubString(String inputString, int limit) {
            if (inputString == null) {
                  return "";
            }
            if (inputString.length() == 0 || limit <= 0) {
                  return inputString;
            }

            StringBuilder message = new StringBuilder();
            int actualTextLength = 0; // Text length without considering HTML tags
            int skippedDepth = 0;     // Number of dropped start tags still waiting for their end tag
            int lastEnd = 0;          // Index just past the previously processed tag

            Matcher tagMatcher = HTML_TAG.matcher(inputString);
            while (tagMatcher.find()) {
                  // Text located between the previous tag and this one
                  actualTextLength = appendWords(message,
                              inputString.substring(lastEnd, tagMatcher.start()), actualTextLength, limit);
                  lastEnd = tagMatcher.end();

                  String tag = tagMatcher.group();
                  boolean withinLimit = actualTextLength < limit;
                  if (tag.startsWith("</")) {
                        if (skippedDepth > 0) {
                              skippedDepth--; // closes a start tag that was dropped, so drop it too
                        } else {
                              message.append(tag);
                        }
                  } else if (STANDALONE_TAG.matcher(tag).matches()) {
                        // No end tag will follow, so this must not affect skippedDepth
                        if (withinLimit) {
                              message.append(tag);
                        }
                  } else if (withinLimit) {
                        message.append(tag);
                  } else {
                        skippedDepth++;
                  }
            }
            // Text after the last tag (or the whole input when it contains no tags)
            appendWords(message, inputString.substring(lastEnd), actualTextLength, limit);

            return message.toString();
      }

      /**
       * Appends whole words (and the single spaces between them) from {@code text} to
       * {@code out} until {@code limit} characters have been reached.
       *
       * @return the updated text length
       */
      private static int appendWords(StringBuilder out, String text, int currentLength, int limit) {
            StringTokenizer words = new StringTokenizer(text, " ", true);
            while (currentLength < limit && words.hasMoreTokens()) {
                  String word = words.nextToken();
                  out.append(word);
                  currentLength += word.length();
            }
            return currentLength;
      }
}

/**
 * Iterator that walks a character sequence using a regular expression.
 *
 * Every match of the pattern is returned as a token. When {@code returnDelims} is
 * {@code true}, the text preceding each match (possibly an empty string) and any text
 * remaining after the last match are returned as well, in input order.
 */
class RETokenizer implements Iterator<String> {
      private final CharSequence input;
      private final boolean returnDelims;
      // Set to null once the input is exhausted so that find() is never called again
      private Matcher matcher;
      // Pending text between matches; returned before the pending match
      private String delim;
      // Pending pattern match
      private String match;
      // End index of the last successful match
      private int lastEnd = 0;

      /**
       * @param input        the text to tokenize
       * @param patternStr   regular expression whose matches are the tokens
       * @param returnDelims whether the text between matches is returned too
       */
      public RETokenizer(CharSequence input, String patternStr, boolean returnDelims) {
            this.input = input;
            this.returnDelims = returnDelims;
            matcher = Pattern.compile(patternStr).matcher(input);
      }

      /**
       * Returns true if there are more tokens or delimiters. Calling it repeatedly
       * without calling {@link #next()} does not consume or lose anything.
       */
      @Override
      public boolean hasNext() {
            // Pending values must be checked first: the matcher is already null while the
            // trailing delimiter is still waiting to be returned.
            if (delim != null || match != null) {
                  return true;
            }
            if (matcher == null) {
                  return false;
            }

            if (matcher.find()) {
                  if (returnDelims) {
                        delim = input.subSequence(lastEnd, matcher.start()).toString();
                  }
                  match = matcher.group();
                  lastEnd = matcher.end();
            } else {
                  if (returnDelims && lastEnd < input.length()) {
                        delim = input.subSequence(lastEnd, input.length()).toString();
                        lastEnd = input.length();
                  }
                  // Retire the matcher after any failed find(): a later find() is
                  // documented to start over from the beginning of the input.
                  matcher = null;
            }
            return delim != null || match != null;
      }

      /**
       * Returns the next token (or delimiter if returnDelims is true).
       *
       * @throws java.util.NoSuchElementException if nothing is left to return
       */
      @Override
      public String next() {
            if (!hasNext()) {
                  throw new java.util.NoSuchElementException();
            }
            String result;
            if (delim != null) {
                  result = delim;
                  delim = null;
            } else {
                  result = match;
                  match = null;
            }
            return result;
      }

      /**
       * Returns true when the value that {@link #next()} would return is a pending pattern
       * match rather than a delimiter. Only reflects values already fetched by
       * {@link #hasNext()}.
       */
      public boolean isNextToken() {
            return delim == null && match != null;
      }

      /** Not supported. */
      @Override
      public void remove() {
            throw new UnsupportedOperationException();
      }
}

Comments

Popular posts from this blog

AEM - Query list of components and templates

AEM 6.3 - Bundle Whitelisting - Deprecation of administrative authentication

AEM as a Cloud Service (AEMaaCS) – Architecture Overview

AEM, FORM Submission & Handling POST requests