CrawlerItem.java
/*
* #%L
* wcm.io
* %%
* Copyright (C) 2023 wcm.io
* %%
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
* #L%
*/
package io.wcm.siteapi.integrationtestsupport.crawler;
import java.util.stream.Stream;
import org.jetbrains.annotations.NotNull;
import com.jayway.jsonpath.DocumentContext;
import com.jayway.jsonpath.JsonPath;
import io.wcm.siteapi.integrationtestsupport.IntegrationTestContext;
import io.wcm.siteapi.integrationtestsupport.httpclient.HttpRequestFailedException;
import io.wcm.siteapi.openapi.validator.ContentValidationException;
import io.wcm.siteapi.openapi.validator.OpenApiSchemaValidator;
/**
 * Validates the JSON response of a single URL and continues crawling with the link URLs found in it.
 */
class CrawlerItem {

  private final Crawler crawler;
  private final IntegrationTestContext context;
  private final String url;
  // URL of the document this item was reached from; null when the item was created without a referer
  private final String refererUrl;

  /**
   * Creates an item without a referer URL.
   * @param crawler Crawler used for visit tracking, suffix parsing, failure logging and link extractors
   * @param context Integration test context providing the validator, HTTP client and API version
   * @param url URL to fetch and validate
   */
  CrawlerItem(Crawler crawler, IntegrationTestContext context, String url) {
    this(crawler, context, url, null);
  }

  /**
   * Creates an item that remembers the URL it was reached from.
   * @param crawler Crawler used for visit tracking, suffix parsing, failure logging and link extractors
   * @param context Integration test context providing the validator, HTTP client and API version
   * @param url URL to fetch and validate
   * @param refererUrl URL of the document that contained the link to <code>url</code>, or <code>null</code>
   */
  CrawlerItem(Crawler crawler, IntegrationTestContext context, String url, String refererUrl) {
    this.crawler = crawler;
    this.context = context;
    this.url = url;
    this.refererUrl = refererUrl;
  }

  /**
   * Fetches the URL, validates the response and follows all links found in it.
   * <p>
   * Processing stops silently if the crawler reports the URL as already visited. If parsing the
   * suffix, loading the body, or validating the body fails, the failure is reported via
   * {@link Crawler#logFailedVisitUrl} and the links of this URL are not followed.
   * </p>
   * <p>
   * Links are followed by calling this method recursively on new items, so the call depth
   * grows with the length of the link chain.
   * </p>
   */
  void fetch() {
    // skip processing if page was already crawled
    if (!crawler.visitUrl(url)) {
      return;
    }

    // parse and validate URL
    String suffix;
    try {
      suffix = crawler.parseSuffix(url);
    }
    catch (IllegalArgumentException ex) {
      // no response body available at this point, so an empty string is passed as third argument
      crawler.logFailedVisitUrl(url, appendReferer(ex.getMessage()), "");
      return;
    }

    // load JSON from URL
    OpenApiSchemaValidator validator = context.getValidator(suffix);
    String json;
    try {
      json = context.getHttpClient().getBody(url);
    }
    catch (HttpRequestFailedException ex) {
      crawler.logFailedVisitUrl(url, appendReferer(ex.getMessage()), "");
      return;
    }

    // validate JSON against OAS3 spec
    try {
      validator.validate(json);
    }
    catch (ContentValidationException ex) {
      // NOTE(review): unlike the two failure paths above, this message does not include the referer,
      // but it does pass the fetched JSON along - confirm this difference is intended
      crawler.logFailedVisitUrl(url, "Validator(" + context.getApiVersion() + "," + suffix + ") " + ex.getMessage(), json);
      return;
    }

    // continue crawling with all link URLs found; the current URL becomes the referer of each followed link
    getAllLinks(JsonPath.parse(json), suffix)
        .forEach(followUrl -> new CrawlerItem(crawler, context, followUrl, url).fetch());
  }

  /**
   * Collects the links from all link extractors of the crawler that accept the given suffix.
   * @param jsonPathContext Parsed JSON document handed to each accepting extractor
   * @param suffix Suffix used to select the extractors
   * @return Stream of links with duplicates removed
   */
  private Stream<String> getAllLinks(@NotNull DocumentContext jsonPathContext, @NotNull String suffix) {
    return crawler.getLinkExtractors().stream()
        .filter(extractor -> extractor.accept(suffix))
        .flatMap(extractor -> extractor.getLinks(jsonPathContext))
        .distinct();
  }

  /**
   * Appends the referer URL to a failure message, if a referer is present.
   * @param message Failure message
   * @return Message followed by <code>", referer: &lt;refererUrl&gt;"</code>, or the unchanged message if there is no referer
   */
  private String appendReferer(String message) {
    if (refererUrl != null) {
      return message + ", referer: " + refererUrl;
    }
    return message;
  }

}