RichTextUtil.java
/*
* #%L
* wcm.io
* %%
* Copyright (C) 2014 wcm.io
* %%
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
* #L%
*/
package io.wcm.handler.richtext.util;
import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Pattern;
import org.apache.commons.lang3.StringUtils;
import org.jdom2.Content;
import org.jdom2.Document;
import org.jdom2.Element;
import org.jdom2.JDOMConstants;
import org.jdom2.JDOMException;
import org.jdom2.Text;
import org.jdom2.input.SAXBuilder;
import org.jetbrains.annotations.NotNull;
import org.jetbrains.annotations.Nullable;
import org.osgi.annotation.versioning.ProviderType;
/**
 * Utility methods for handling XHTML rich text fragments, e.g. used for FCKEditor.
 */
@ProviderType
public final class RichTextUtil {

  /**
   * Default maximum raw length of a text block that is still inspected by {@link #isEmpty(String)}.
   */
  private static final int EMPTYTEXT_DEFAULT_TRESHOLD = 20;

  /**
   * Tokens that are stripped (one after the other, in this order) from a text block before it is checked
   * for emptiness: plain space, non-breaking space (as named entity, as numeric entity and as literal
   * U+00A0 character) and line breaks.
   */
  private static final String[] EMPTYTEXT_IGNORED_TOKENS = {
      " ",
      "&nbsp;",
      "&#160;",
      "\u00A0",
      "\n",
      "\r"
  };

  /**
   * Internal DTD subset that pulls in the XHTML Latin-1, symbol and special character entity sets.
   */
  private static final String XHTML_ENTITY_DEF =
      "<!ENTITY % HTMLlat1 PUBLIC \"" + XHtmlResource.ENTITIES_LAT1.getPublicId() + "\" "
          + "\"" + XHtmlResource.ENTITIES_LAT1.getSystemId() + "\">"
          + "%HTMLlat1;"
          + "<!ENTITY % HTMLsymbol PUBLIC \"" + XHtmlResource.ENTITIES_SYMBOL.getPublicId() + "\" "
          + "\"" + XHtmlResource.ENTITIES_SYMBOL.getSystemId() + "\">"
          + "%HTMLsymbol;"
          + "<!ENTITY % HTMLspecial PUBLIC \"" + XHtmlResource.ENTITIES_SPECIAL.getPublicId() + "\" "
          + "\"" + XHtmlResource.ENTITIES_SPECIAL.getSystemId() + "\">"
          + "%HTMLspecial;";

  /*
   * Pattern that matches all characters that are not allowed in XML 1.0 (https://www.w3.org/TR/REC-xml/#charsets).
   * Actual regex pattern from https://stackoverflow.com/a/4237934
   * In addition to that list, "\r" is matched (and thus removed) as well to use only unix-style new lines.
   */
  private static final Pattern CONTROL_CHARS_NOT_ALLOWED_IN_XML10 = Pattern.compile("[^"
      + "\u0009\n"
      + "\u0020-\uD7FF"
      + "\uE000-\uFFFD"
      + "\ud800\udc00-\udbff\udfff"
      + "]");

  private RichTextUtil() {
    // utility methods only
  }

  /**
   * Check if the given formatted text block is empty.
   * A text block containing only one paragraph element and whitespaces is considered as empty.
   * A text block with more than 20 characters (raw data) is never considered as empty.
   * @param text XHTML text string (root element not needed)
   * @return true if text block is empty
   */
  public static boolean isEmpty(@Nullable String text) {
    return isEmpty(text, EMPTYTEXT_DEFAULT_TRESHOLD);
  }

  /**
   * Check if the given formatted text block is empty.
   * A text block containing only one paragraph element, whitespaces and non-breaking spaces
   * (<code>&amp;nbsp;</code>, <code>&amp;#160;</code> or the U+00A0 character) is considered as empty.
   * A text block with more than <code>treshold</code> characters (raw data) is never considered as empty.
   * @param text XHTML text string (root element not needed)
   * @param treshold Threshold value - strings with more than this number of characters are never
   *          considered as empty and are not inspected any further.
   * @return true if text block is empty
   */
  @SuppressWarnings("null")
  public static boolean isEmpty(@Nullable String text, int treshold) {
    // null or zero-length string is always empty
    if (StringUtils.isEmpty(text)) {
      return true;
    }

    // the threshold applies to the raw (uncleaned) text length
    if (text.length() > treshold) {
      return false;
    }

    // strip all whitespaces and nbsp's, one token after the other
    String cleanedText = text;
    for (String ignoredToken : EMPTYTEXT_IGNORED_TOKENS) {
      cleanedText = StringUtils.replace(cleanedText, ignoredToken, "");
    }

    return StringUtils.isEmpty(cleanedText) || "<p></p>".equals(cleanedText);
  }

  /**
   * Parses XHTML text string, and adds the parsed content to the given parent element.
   * Resolving of XHTML entities is not supported by this variant.
   * @param parent Parent element to add parsed content to
   * @param text XHTML text string (root element not needed)
   * @throws JDOMException Is thrown if the text could not be parsed as XHTML
   */
  public static void addParsedText(@NotNull Element parent, @NotNull String text) throws JDOMException {
    addParsedText(parent, text, false);
  }

  /**
   * Parses XHTML text string, and adds the parsed content to the given parent element.
   * @param parent Parent element to add parsed content to
   * @param text XHTML text string (root element not needed)
   * @param xhtmlEntities If set to true, resolving of XHTML entities in the XHTML fragment is supported.
   * @throws JDOMException Is thrown if the text could not be parsed as XHTML
   */
  public static void addParsedText(@NotNull Element parent, @NotNull String text, boolean xhtmlEntities) throws JDOMException {
    Element root = parseText(text, xhtmlEntities);
    // add copies of the parsed nodes - the parsed nodes themselves stay attached to the temporary root element
    parent.addContent(root.cloneContent());
  }

  /**
   * Parses XHTML text string. Adds a wrapping "root" element before parsing and returns this root element.
   * Resolving of XHTML entities is not supported by this variant.
   * @param text XHTML text string (root element not needed)
   * @return Root element with parsed xhtml content
   * @throws JDOMException Is thrown if the text could not be parsed as XHTML
   */
  public static @NotNull Element parseText(@NotNull String text) throws JDOMException {
    return parseText(text, false);
  }

  /**
   * Parses XHTML text string. Adds a wrapping "root" element before parsing and returns this root element.
   * Characters that are not allowed in XML 1.0 are removed from the text before it is parsed.
   * @param text XHTML text string (root element not needed)
   * @param xhtmlEntities If set to true, resolving of XHTML entities in the XHTML fragment is supported.
   * @return Root element with parsed xhtml content
   * @throws JDOMException Is thrown if the text could not be parsed as XHTML
   * @throws IllegalArgumentException Is thrown if reading the XHTML fragment fails with an I/O error
   */
  public static @NotNull Element parseText(@NotNull String text, boolean xhtmlEntities) throws JDOMException {

    // add root element, remove invalid chars from input text
    String xhtmlString =
        (xhtmlEntities ? "<!DOCTYPE root [" + XHTML_ENTITY_DEF + "]>" : "")
            + "<root>" + removeCharsNotAllowedInXML10(text) + "</root>";

    try {
      SAXBuilder saxBuilder = new SAXBuilder();

      // XXE prevention
      saxBuilder.setFeature(JDOMConstants.SAX_FEATURE_EXTERNAL_ENT, false);
      saxBuilder.setExpandEntities(false);

      if (xhtmlEntities) {
        // the generated DOCTYPE references the XHTML entity sets - they are served by the resolver
        saxBuilder.setEntityResolver(XHtmlEntityResolver.getInstance());
      }
      else {
        // no DOCTYPE is generated in this mode, so any DOCTYPE in the input can be rejected
        saxBuilder.setFeature("http://apache.org/xml/features/disallow-doctype-decl", true);
        saxBuilder.setFeature("http://xml.org/sax/features/external-parameter-entities", false);
      }

      Document doc = saxBuilder.build(new StringReader(xhtmlString));
      return doc.getRootElement();
    }
    catch (IOException ex) {
      throw new IllegalArgumentException("Error parsing XHTML fragment.", ex);
    }
  }

  /**
   * Removes all control chars and other chars which are not allowed in XML 1.0.
   * See <a href="https://www.w3.org/TR/xml/#charsets">https://www.w3.org/TR/xml/#charsets</a>.
   * Carriage returns ("\r") are removed as well.
   * @param value String which should be parsed to XML 1.0
   * @return Cleaned up string
   */
  private static String removeCharsNotAllowedInXML10(String value) {
    return CONTROL_CHARS_NOT_ALLOWED_IN_XML10.matcher(value).replaceAll("");
  }

  /**
   * Rewrites all children/sub-tree of the given parent element.
   * For rewrite operations the given rewrite content handler is called for each element and each text node.
   * The handler result is interpreted as follows: <code>null</code> or a list containing only the
   * very same node keeps the node, an empty list removes it, any other list replaces the node by
   * clones of the list items.
   * @param parent Parent element
   * @param rewriteContentHandler Rewrite content handler
   */
  public static void rewriteContent(@NotNull Element parent, @NotNull RewriteContentHandler rewriteContentHandler) {

    // iterate through content list and build new content list
    List<Content> newContent = new ArrayList<>();
    for (Content contentItem : parent.getContent()) {
      if (contentItem instanceof Element) {
        rewriteElementInto(newContent, (Element)contentItem, rewriteContentHandler);
      }
      else if (contentItem instanceof Text) {
        rewriteTextInto(newContent, (Text)contentItem, rewriteContentHandler);
      }
      else {
        // unknown content type - just add to new content
        newContent.add(contentItem);
      }
    }

    // replace original content with new content
    parent.removeContent();
    parent.addContent(newContent);
  }

  /**
   * Applies the rewrite handler to a single element, descends into its sub-tree and appends the outcome
   * to the new content list.
   */
  private static void rewriteElementInto(List<Content> newContent, Element element, RewriteContentHandler rewriteContentHandler) {
    List<Content> rewritten = rewriteContentHandler.rewriteElement(element);

    // element is kept - rewrite its child elements and add the original element
    if (isUnchanged(rewritten, element)) {
      rewriteContent(element, rewriteContentHandler);
      newContent.add(element);
      return;
    }

    // element was removed (empty list - loop adds nothing) or replaced with other content:
    // rewrite replacement elements as well and add clones of them instead
    for (Content replacementItem : rewritten) {
      if (replacementItem instanceof Element) {
        rewriteContent((Element)replacementItem, rewriteContentHandler);
      }
      newContent.add(replacementItem.clone());
    }
  }

  /**
   * Applies the rewrite handler to a single text node and appends the outcome to the new content list.
   */
  private static void rewriteTextInto(List<Content> newContent, Text text, RewriteContentHandler rewriteContentHandler) {
    List<Content> rewritten = rewriteContentHandler.rewriteText(text);

    // text node is kept - add original text node
    if (isUnchanged(rewritten, text)) {
      newContent.add(text);
      return;
    }

    // text node was removed (empty list - loop adds nothing) or replaced with other content
    for (Content replacementItem : rewritten) {
      newContent.add(replacementItem.clone());
    }
  }

  /**
   * Checks if a rewrite handler result means "keep the original node": either no result at all,
   * or a list containing exactly the original node instance.
   */
  private static boolean isUnchanged(List<Content> rewritten, Content original) {
    if (rewritten == null) {
      return true;
    }
    // identity comparison is intended - the handler signals "unchanged" by returning the same instance
    return rewritten.size() == 1 && rewritten.get(0) == original; //NOPMD
  }

}