/*
 * #%L
 * wcm.io
 * %%
 * Copyright (C) 2023 wcm.io
 * %%
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 * #L%
 */
package io.wcm.siteapi.integrationtestsupport.crawler;

import java.util.Collection;
import java.util.Collections;
import java.util.HashSet;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Set;

import org.jetbrains.annotations.NotNull;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import io.wcm.siteapi.integrationtestsupport.IntegrationTestContext;
import io.wcm.siteapi.integrationtestsupport.linkextractor.LinkExtractor;

/**
 * Generic Site API JSON content crawler.
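 * <p>
 * A minimal usage sketch. The {@code context} setup, the {@code MyLinkExtractor}
 * implementation and the index URL are illustrative assumptions, not part of this class:
 * </p>
 * <pre>{@code
 * Crawler crawler = new Crawler(context, List.of(new MyLinkExtractor()));
 * crawler.start(context.getPublishUrl() + "/content/site/index.site.api.json");
 * if (crawler.numberOfFailedVisits() > 0) {
 *   throw new AssertionError("Failed URLs: " + crawler.failedVisitUrls());
 * }
 * }</pre>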
 */
public final class Crawler {

  private static final Logger log = LoggerFactory.getLogger(Crawler.class);

  private final IntegrationTestContext context;
  private final Collection<LinkExtractor> linkExtractors;
  private final Set<String> visitedUrls = new HashSet<>();
  private final Set<String> failedUrls = new LinkedHashSet<>();
  private final UrlParser urlParser;

  /**
   * @param context Integration test context
   * @param linkExtractors Link extractors used to detect links to follow in the crawled JSON content.
   */
  public Crawler(@NotNull IntegrationTestContext context,
      @NotNull List<LinkExtractor> linkExtractors) {
    this.context = context;
    this.linkExtractors = Collections.unmodifiableCollection(linkExtractors);
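    // the URL parser mirrors the publish instance's Site API URL structure
    // (selector, API version, extension) so suffixes can later be extracted from crawled URLs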
    this.urlParser = new UrlParser(context.getPublishUrl(),
        context.getSelector(), context.getApiVersion(), context.getExtension());
  }

  /**
   * Start API crawling.
   * @param url API index URL to start crawling at.
   */
  public void start(@NotNull String url) {
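    // fetch the index document; each crawled item validates its response and
    // follows further links detected by the configured link extractors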
    CrawlerItem item = new CrawlerItem(this, context, url);
    item.fetch();
  }

  /**
   * @return Total number of visited URLs.
   */
  public int numberOfVisits() {
    return visitedUrls.size();
  }

  /**
   * @return Number of failed visits.
   */
  public int numberOfFailedVisits() {
    return failedUrls.size();
  }

  /**
   * @return URLs of failed visits.
   */
  public @NotNull Collection<String> failedVisitUrls() {
    return Collections.unmodifiableCollection(failedUrls);
  }

  /**
   * Called when a URL is about to be visited; records the URL as visited.
   * @param url URL to visit
   * @return true if the URL was not visited before and should be fetched now
   */
  boolean visitUrl(@NotNull String url) {
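    // Set.add returns false if the URL was already present, so every URL is fetched at most once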
    boolean doVisit = visitedUrls.add(url);
    if (doVisit) {
      log.info("Visit: {}", url);
    }
    else {
      log.debug("Skip: {}", url);
    }
    return doVisit;
  }

  /**
   * Log a failed visit and record its URL.
   * @param url URL of the failed visit
   * @param message Failure message
   * @param fullResponse Full response body for diagnostics
   */
  void logFailedVisitUrl(@NotNull String url, @NotNull String message, @NotNull String fullResponse) {
    log.error("Validation FAILED: {}\n{}\n\n{}\n", url, message, fullResponse);
    failedUrls.add(url);
  }

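  /**
   * @return Link extractors used to detect links in crawled content.
   */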
  @NotNull
  Collection<LinkExtractor> getLinkExtractors() {
    return this.linkExtractors;
  }

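  /**
   * Extract the suffix from a full Site API URL.
   * @param url Full URL
   * @return Suffix part of the URL
   */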
  @NotNull
  String parseSuffix(@NotNull String url) {
    return urlParser.parseSuffix(url);
  }

}