RichTextInternalLinks.java
/*
* #%L
* wcm.io
* %%
* Copyright (C) 2023 wcm.io
* %%
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
* #L%
*/
package io.wcm.siteapi.integrationtestsupport.linkextractor;
import static com.jayway.jsonpath.Criteria.where;
import static com.jayway.jsonpath.Filter.filter;
import java.util.Arrays;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.stream.Stream;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import com.jayway.jsonpath.DocumentContext;
import com.jayway.jsonpath.JsonPath;
/**
 * Extracts internal links from rich text fragments in JSON.
 *
 * <p>
 * Looks for components with specific resource types and a "text" property containing the HTML fragment.
 * Within the HTML fragments anchor tags with an attribute "data-type=internal" are processed, which is the
 * default representation of internal links generated by <a href="https://wcm.io/handler/link/">wcm.io Link Handler</a>.
 * </p>
 *
 * <p>
 * Components whose "text" property is present but is not a string (e.g. JSON null) are skipped.
 * </p>
 */
public final class RichTextInternalLinks implements LinkExtractor {

  /** Suffixes for which this extractor reports itself as applicable. */
  private static final Set<String> SUFFIXES = Set.of("content");

  /** Compiled JSON path matching all objects with one of the resource types and a "text" property. */
  private final JsonPath jsonPath;

  /**
   * @param resourceTypes Resource type(s) for components containing rich text.
   */
  public RichTextInternalLinks(String... resourceTypes) {
    this(Arrays.asList(resourceTypes));
  }

  /**
   * @param resourceTypes Resource type(s) for components containing rich text.
   */
  public RichTextInternalLinks(List<String> resourceTypes) {
    // deep-scan the whole document; the "?" placeholder is bound to the filter passed as second argument
    jsonPath = JsonPath.compile("$..[?]",
        filter(where(":type").in(resourceTypes).and("text").exists(true)));
  }

  /**
   * @param suffix Suffix to check
   * @return true if the suffix is contained in the set of supported suffixes ("content")
   */
  @Override
  public boolean accept(String suffix) {
    return SUFFIXES.contains(suffix);
  }

  /**
   * Collects the "href" values of all internal link anchors found in the rich text components
   * of the given JSON document.
   * @param jsonPathContext Parsed JSON document to search
   * @return Stream of "href" attribute values, one per matching anchor (empty string if the
   *         anchor has no "href" attribute)
   */
  @Override
  public Stream<String> getLinks(DocumentContext jsonPathContext) {
    // find all rich text components and parse rich text from "text" property
    final List<Map<String, Object>> richTextComponents = jsonPathContext.read(jsonPath);
    return richTextComponents.stream()
        .map(entry -> entry.get("text"))
        // the filter only guarantees that "text" exists - its value may still be null
        // or a non-string JSON value, neither of which can be parsed as HTML
        .filter(String.class::isInstance)
        .map(String.class::cast)
        .flatMap(this::extractLinksFromHtmlFragment);
  }

  /**
   * Parses a HTML fragment and returns the "href" values of all anchors marked as internal links.
   * @param html HTML fragment (must not be null)
   * @return Stream of "href" attribute values
   */
  private Stream<String> extractLinksFromHtmlFragment(String html) {
    final Document document = Jsoup.parse(html);
    return document.select("a[data-type='internal']").stream()
        .map(a -> a.attr("href"));
  }

}