View Javadoc
1   /*
2    * #%L
3    * wcm.io
4    * %%
5    * Copyright (C) 2014 wcm.io
6    * %%
7    * Licensed under the Apache License, Version 2.0 (the "License");
8    * you may not use this file except in compliance with the License.
9    * You may obtain a copy of the License at
10   *
11   *      http://www.apache.org/licenses/LICENSE-2.0
12   *
13   * Unless required by applicable law or agreed to in writing, software
14   * distributed under the License is distributed on an "AS IS" BASIS,
15   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16   * See the License for the specific language governing permissions and
17   * limitations under the License.
18   * #L%
19   */
20  package io.wcm.tooling.commons.packmgr.unpack;
21  
22  import static org.apache.jackrabbit.vault.util.Constants.DOT_CONTENT_XML;
23  import static org.apache.jackrabbit.vault.util.Constants.ROOT_DIR;
24  
25  import java.io.File;
26  import java.io.FileOutputStream;
27  import java.io.IOException;
28  import java.io.InputStream;
29  import java.io.OutputStream;
30  import java.util.ArrayList;
31  import java.util.Calendar;
32  import java.util.Enumeration;
33  import java.util.HashSet;
34  import java.util.LinkedHashSet;
35  import java.util.List;
36  import java.util.Set;
37  import java.util.TreeSet;
38  import java.util.concurrent.atomic.AtomicBoolean;
39  import java.util.regex.Matcher;
40  import java.util.regex.Pattern;
41  import java.util.regex.PatternSyntaxException;
42  
43  import javax.jcr.PropertyType;
44  import javax.xml.XMLConstants;
45  import javax.xml.parsers.ParserConfigurationException;
46  import javax.xml.parsers.SAXParser;
47  import javax.xml.parsers.SAXParserFactory;
48  
49  import org.apache.commons.compress.archivers.zip.ZipArchiveEntry;
50  import org.apache.commons.compress.archivers.zip.ZipFile;
51  import org.apache.commons.io.FileUtils;
52  import org.apache.commons.io.FilenameUtils;
53  import org.apache.commons.io.IOUtils;
54  import org.apache.commons.lang3.StringUtils;
55  import org.apache.jackrabbit.JcrConstants;
56  import org.apache.jackrabbit.util.ISO8601;
57  import org.apache.jackrabbit.vault.fs.io.DocViewFormat;
58  import org.apache.jackrabbit.vault.util.PlatformNameFormat;
59  import org.jdom2.Attribute;
60  import org.jdom2.Document;
61  import org.jdom2.Element;
62  import org.jdom2.JDOMException;
63  import org.jdom2.Namespace;
64  import org.jdom2.input.SAXBuilder;
65  import org.jdom2.output.Format;
66  import org.jdom2.output.LineSeparator;
67  import org.jdom2.output.XMLOutputter;
68  import org.xml.sax.Attributes;
69  import org.xml.sax.SAXException;
70  import org.xml.sax.helpers.DefaultHandler;
71  
72  import edu.umd.cs.findbugs.annotations.SuppressFBWarnings;
73  import io.wcm.tooling.commons.packmgr.PackageManagerException;
74  
75  /**
76   * Manages unpacking ZIP file content applying exclude patterns.
77   */
78  public final class ContentUnpacker {
79  
80    private static final String MIXINS_PROPERTY = "jcr:mixinTypes";
81    private static final String PRIMARYTYPE_PROPERTY = "jcr:primaryType";
82    private static final Namespace JCR_NAMESPACE = Namespace.getNamespace("jcr", "http://www.jcp.org/jcr/1.0");
83    private static final Namespace CQ_NAMESPACE = Namespace.getNamespace("cq", "http://www.day.com/jcr/cq/1.0");
84    private static final Pattern FILENAME_NAMESPACE_PATTERN = Pattern.compile("^([^:]+):(.+)$");
85  
86    private static final SAXParserFactory SAX_PARSER_FACTORY;
87    static {
88      SAX_PARSER_FACTORY = SAXParserFactory.newInstance();
89      SAX_PARSER_FACTORY.setNamespaceAware(true);
90    }
91  
92    private static final DocViewFormat DOCVIEWFORMAT = new DocViewFormat();
93  
94    private final Pattern[] excludeFiles;
95    private final Pattern[] excludeNodes;
96    private final Pattern[] excludeProperties;
97    private final Pattern[] excludeMixins;
98    private final boolean markReplicationActivated;
99    private final Pattern[] markReplicationActivatedIncludeNodes;
100   private final String dateLastReplicated;
101 
102   /**
103    * @param properties Configuration properties
104    */
105   public ContentUnpacker(ContentUnpackerProperties properties) {
106     this.excludeFiles = toPatternArray(properties.getExcludeFiles());
107     this.excludeNodes = toPatternArray(properties.getExcludeNodes());
108     this.excludeProperties = toPatternArray(properties.getExcludeProperties());
109     this.excludeMixins = toPatternArray(properties.getExcludeMixins());
110     this.markReplicationActivated = properties.isMarkReplicationActivated();
111     this.markReplicationActivatedIncludeNodes = toPatternArray(properties.getMarkReplicationActivatedIncludeNodes());
112 
113     if (StringUtils.isNotBlank(properties.getDateLastReplicated())) {
114       this.dateLastReplicated = properties.getDateLastReplicated();
115     }
116     else {
117       // set to current date
118       Calendar cal = Calendar.getInstance();
119       cal.set(Calendar.HOUR_OF_DAY, 0);
120       cal.set(Calendar.MINUTE, 0);
121       cal.set(Calendar.SECOND, 0);
122       cal.set(Calendar.MILLISECOND, 0);
123       this.dateLastReplicated = ISO8601.format(cal);
124     }
125   }
126 
127   private static Pattern[] toPatternArray(String[] patternStrings) {
128     if (patternStrings == null) {
129       return new Pattern[0];
130     }
131     Pattern[] patterns = new Pattern[patternStrings.length];
132     for (int i = 0; i < patternStrings.length; i++) {
133       try {
134         patterns[i] = Pattern.compile(patternStrings[i]);
135       }
136       catch (PatternSyntaxException ex) {
137         throw new PackageManagerException("Invalid regexp pattern: " + patternStrings[i], ex);
138       }
139     }
140     return patterns;
141   }
142 
143   private static boolean matches(String name, Pattern[] patterns, boolean defaultIfNotPatternsDefined) {
144     if (patterns.length == 0) {
145       return defaultIfNotPatternsDefined;
146     }
147     for (Pattern pattern : patterns) {
148       if (pattern.matcher(name).matches()) {
149         return true;
150       }
151     }
152     return false;
153   }
154 
155   private boolean applyXmlExcludes(String name) {
156     if (this.excludeNodes.length == 0 && this.excludeProperties.length == 0) {
157       return false;
158     }
159     return isJcrContentXmlFile(name);
160   }
161 
162   private boolean isJcrContentXmlFile(String name) {
163     return StringUtils.equalsIgnoreCase(FilenameUtils.getExtension(name), "xml")
164         && StringUtils.startsWith(name, "jcr_root/");
165   }
166 
167   /**
168    * Unpacks file
169    * @param file File
170    * @param outputDirectory Output directory
171    */
172   public void unpack(File file, File outputDirectory) {
173     try (ZipFile zipFile = new ZipFile.Builder().setFile(file).get()) {
174       Enumeration<ZipArchiveEntry> entries = zipFile.getEntries();
175       while (entries.hasMoreElements()) {
176         ZipArchiveEntry entry = entries.nextElement();
177         if (!matches(entry.getName(), excludeFiles, false)) {
178           unpackEntry(zipFile, entry, outputDirectory);
179         }
180       }
181     }
182     catch (IOException ex) {
183       throw new PackageManagerException("Error reading content package " + file.getAbsolutePath(), ex);
184     }
185   }
186 
187   @SuppressFBWarnings("RV_RETURN_VALUE_IGNORED_BAD_PRACTICE")
188   private void unpackEntry(ZipFile zipFile, ZipArchiveEntry entry, File outputDirectory) throws IOException {
189     if (entry.isDirectory()) {
190       File directory = FileUtils.getFile(outputDirectory, entry.getName());
191       directory.mkdirs();
192     }
193     else {
194       Set<String> namespacePrefixes = null;
195       if (applyXmlExcludes(entry.getName())) {
196         namespacePrefixes = getNamespacePrefixes(zipFile, entry);
197       }
198 
199       try (InputStream entryStream = zipFile.getInputStream(entry)) {
200         File outputFile = FileUtils.getFile(outputDirectory, entry.getName());
201         if (outputFile.exists()) {
202           outputFile.delete();
203         }
204         File directory = outputFile.getParentFile();
205         directory.mkdirs();
206 
207         try (FileOutputStream fos = new FileOutputStream(outputFile)) {
208           if (applyXmlExcludes(entry.getName()) && namespacePrefixes != null) {
209             // write file with XML filtering
210             try {
211               writeXmlWithExcludes(entry, entryStream, fos, namespacePrefixes);
212             }
213             catch (JDOMException ex) {
214               throw new PackageManagerException("Unable to parse XML file: " + entry.getName(), ex);
215             }
216           }
217           else {
218             // write file directly without XML filtering
219             IOUtils.copy(entryStream, fos);
220           }
221         }
222         if (isJcrContentXmlFile(entry.getName())) {
223           // format output file using DocView format
224           try {
225             DOCVIEWFORMAT.format(outputFile, false);
226           }
227           catch (IOException ex) {
228             throw new IOException("Unable to apply DocView format to file: " + outputFile.getAbsolutePath(), ex);
229           }
230         }
231       }
232     }
233   }
234 
235   /**
236    * Parses XML file with namespace-aware SAX parser to get defined namespaces prefixes in order of appearance
237    * (to keep the same order when outputting the XML file again).
238    * @param zipFile ZIP file
239    * @param entry ZIP entry
240    * @return Ordered set with namespace prefixes in correct order.
241    *         Returns null if given XML file does not contain FileVault XML content.
242    */
243   private Set<String> getNamespacePrefixes(ZipFile zipFile, ZipArchiveEntry entry) throws IOException {
244     try (InputStream entryStream = zipFile.getInputStream(entry)) {
245       SAXParser parser = SAX_PARSER_FACTORY.newSAXParser();
246       final Set<String> prefixes = new LinkedHashSet<>();
247 
248       final AtomicBoolean foundRootElement = new AtomicBoolean(false);
249       DefaultHandler handler = new DefaultHandler() {
250         @Override
251         public void startElement(String uri, String localName, String qName, Attributes attributes) throws SAXException {
252           // validate that XML file contains FileVault XML content
253           if (StringUtils.equals(uri, JCR_NAMESPACE.getURI()) && StringUtils.equals(localName, "root")) {
254             foundRootElement.set(true);
255           }
256         }
257         @Override
258         public void startPrefixMapping(String prefix, String uri) throws SAXException {
259           if (StringUtils.isNotBlank(prefix)) {
260             prefixes.add(prefix);
261           }
262         }
263       };
264       parser.parse(entryStream, handler);
265 
266       if (!foundRootElement.get()) {
267         return null;
268       }
269       else {
270         return prefixes;
271       }
272     }
273     catch (IOException | SAXException | ParserConfigurationException ex) {
274       throw new IOException("Error parsing " + entry.getName(), ex);
275     }
276   }
277 
278   private void writeXmlWithExcludes(ZipArchiveEntry entry, InputStream inputStream, OutputStream outputStream, Set<String> namespacePrefixes)
279       throws IOException, JDOMException {
280     SAXBuilder saxBuilder = new SAXBuilder();
281     saxBuilder.setProperty(XMLConstants.ACCESS_EXTERNAL_DTD, "");
282     saxBuilder.setProperty(XMLConstants.ACCESS_EXTERNAL_SCHEMA, "");
283     Document doc = saxBuilder.build(inputStream);
284 
285     Set<String> namespacePrefixesActuallyUsed = new HashSet<>();
286 
287     // check for namespace prefix in file name
288     String namespacePrefix = getNamespacePrefix(entry.getName());
289     if (namespacePrefix != null) {
290       namespacePrefixesActuallyUsed.add(namespacePrefix);
291     }
292 
293     applyXmlExcludes(doc.getRootElement(), getParentPath(entry), namespacePrefixesActuallyUsed, false);
294 
295     XMLOutputter outputter = new XMLOutputter(Format.getPrettyFormat()
296         .setIndent("    ")
297         .setLineSeparator(LineSeparator.UNIX));
298     outputter.setXMLOutputProcessor(new NamspaceOrderedXmlProcessor(namespacePrefixes, namespacePrefixesActuallyUsed));
299     outputter.output(doc, outputStream);
300     outputStream.flush();
301   }
302 
303   static String getNamespacePrefix(String path) {
304     String fileName = FilenameUtils.getName(path);
305     if (StringUtils.equals(DOT_CONTENT_XML, fileName)) {
306       String parentFolderName = FilenameUtils.getName(FilenameUtils.getPathNoEndSeparator(path));
307       if (parentFolderName != null) {
308         String nodeName = PlatformNameFormat.getRepositoryName(parentFolderName);
309         Matcher matcher = FILENAME_NAMESPACE_PATTERN.matcher(nodeName);
310         if (matcher.matches()) {
311           return matcher.group(1);
312         }
313       }
314     }
315     return null;
316   }
317 
318   private String getParentPath(ZipArchiveEntry entry) {
319     return StringUtils.removeEnd(StringUtils.removeStart(entry.getName(), ROOT_DIR), "/" + DOT_CONTENT_XML);
320   }
321 
322   private String buildElementPath(Element element, String parentPath) {
323     StringBuilder path = new StringBuilder(parentPath);
324     if (!StringUtils.equals(element.getQualifiedName(), "jcr:root")) {
325       path.append("/").append(element.getQualifiedName());
326     }
327     return path.toString();
328   }
329 
330   @SuppressWarnings("PMD.EmptyControlStatement")
331   private void applyXmlExcludes(Element element, String parentPath, Set<String> namespacePrefixesActuallyUsed,
332       boolean insideReplicationElement) {
333     String path = buildElementPath(element, parentPath);
334     if (matches(path, this.excludeNodes, false)) {
335       element.detach();
336       return;
337     }
338     collectNamespacePrefix(namespacePrefixesActuallyUsed, element.getNamespacePrefix());
339 
340     String jcrPrimaryType = element.getAttributeValue("primaryType", JCR_NAMESPACE);
341     boolean isRepositoryUserGroup = StringUtils.equals(jcrPrimaryType, "rep:User") || StringUtils.equals(jcrPrimaryType, "rep:Group");
342     boolean isReplicationElement = StringUtils.equals(jcrPrimaryType, "cq:Page")
343         || StringUtils.equals(jcrPrimaryType, "dam:Asset")
344         || StringUtils.equals(jcrPrimaryType, "cq:Template");
345     boolean isContent = insideReplicationElement && StringUtils.equals(element.getQualifiedName(), "jcr:content");
346     boolean setReplicationAttributes = isContent && markReplicationActivated;
347 
348     List<Attribute> attributes = new ArrayList<>(element.getAttributes());
349     for (Attribute attribute : attributes) {
350       boolean excluded = false;
351       if (matches(attribute.getQualifiedName(), this.excludeProperties, false)) {
352         if (isRepositoryUserGroup && StringUtils.equals(attribute.getQualifiedName(), JcrConstants.JCR_UUID)) {
353           // keep jcr:uuid property for groups and users, otherwise they cannot be imported again
354         }
355         else {
356           attribute.detach();
357           excluded = true;
358         }
359       }
360       else if (StringUtils.equals(attribute.getQualifiedName(), PRIMARYTYPE_PROPERTY)) {
361         String namespacePrefix = StringUtils.substringBefore(attribute.getValue(), ":");
362         collectNamespacePrefix(namespacePrefixesActuallyUsed, namespacePrefix);
363       }
364       else if (StringUtils.equals(attribute.getQualifiedName(), MIXINS_PROPERTY)) {
365         String filteredValue = filterMixinsPropertyValue(attribute.getValue(), namespacePrefixesActuallyUsed);
366         if (StringUtils.isBlank(filteredValue)) {
367           attribute.detach();
368         }
369         else {
370           attribute.setValue(filteredValue);
371         }
372       }
373       else if (StringUtils.startsWith(attribute.getValue(), "{Name}")) {
374         collectNamespacePrefixNameArray(namespacePrefixesActuallyUsed, attribute.getValue());
375         // alphabetically sort name values
376         attribute.setValue(sortReferenceValues(attribute.getValue(), PropertyType.NAME));
377       }
378       else if (StringUtils.startsWith(attribute.getValue(), "{WeakReference}")) {
379         // alphabetically sort weak reference values
380         attribute.setValue(sortReferenceValues(attribute.getValue(), PropertyType.WEAKREFERENCE));
381       }
382       if (!excluded) {
383         collectNamespacePrefix(namespacePrefixesActuallyUsed, attribute.getNamespacePrefix());
384       }
385     }
386 
387     // set replication status for jcr:content nodes inside cq:Page nodes
388     if (setReplicationAttributes && matches(path, markReplicationActivatedIncludeNodes, true)) {
389       addMixin(element, "cq:ReplicationStatus");
390       element.setAttribute("lastReplicated", "{Date}" + dateLastReplicated, CQ_NAMESPACE);
391       element.setAttribute("lastReplicationAction", "Activate", CQ_NAMESPACE);
392       collectNamespacePrefix(namespacePrefixesActuallyUsed, CQ_NAMESPACE.getPrefix());
393     }
394 
395     // if current element is a replication element, but the jcr:content node to set the replication attributes to is missing, add it
396     if (isReplicationElement && element.getChild("content", JCR_NAMESPACE) == null
397         && matches(path + "/jcr:content", markReplicationActivatedIncludeNodes, true)) {
398       Element contentNode = new Element("content", JCR_NAMESPACE);
399       String jcrContentPrimaryType = StringUtils.equals(jcrPrimaryType, "cq:Template") ? "cq:PageContent" : jcrPrimaryType + "Content";
400       contentNode.setAttribute("primaryType", jcrContentPrimaryType, JCR_NAMESPACE);
401       element.addContent(contentNode);
402     }
403 
404     List<Element> children = new ArrayList<>(element.getChildren());
405     for (Element child : children) {
406       applyXmlExcludes(child, path, namespacePrefixesActuallyUsed, (insideReplicationElement || isReplicationElement) && !isContent);
407     }
408   }
409 
410   private String filterMixinsPropertyValue(String value, Set<String> namespacePrefixesActuallyUsed) {
411     if (this.excludeMixins.length == 0 || StringUtils.isBlank(value)) {
412       return value;
413     }
414 
415     List<String> mixins = new ArrayList<>();
416     for (String mixin : DocViewUtil.parseValues(value)) {
417       if (!matches(mixin, this.excludeMixins, false)) {
418         String namespacePrefix = StringUtils.substringBefore(mixin, ":");
419         collectNamespacePrefix(namespacePrefixesActuallyUsed, namespacePrefix);
420         mixins.add(mixin);
421       }
422     }
423 
424     if (mixins.isEmpty()) {
425       return null;
426     }
427 
428     return DocViewUtil.formatValues(mixins);
429   }
430 
431   private void addMixin(Element element, String mixin) {
432     String mixinsString = element.getAttributeValue("mixinTypes", JCR_NAMESPACE);
433 
434     List<String> mixins = new ArrayList<>();
435     if (!StringUtils.isBlank(mixinsString)) {
436       for (String item : DocViewUtil.parseValues(mixinsString)) {
437         mixins.add(item);
438       }
439     }
440     if (!mixins.contains(mixin)) {
441       mixins.add(mixin);
442     }
443 
444     element.setAttribute("mixinTypes", DocViewUtil.formatValues(mixins), JCR_NAMESPACE);
445   }
446 
447   private void collectNamespacePrefix(Set<String> prefixes, String prefix) {
448     if (StringUtils.isNotBlank(prefix)) {
449       prefixes.add(prefix);
450     }
451   }
452 
453   private void collectNamespacePrefixNameArray(Set<String> prefixes, String value) {
454     for (String item : DocViewUtil.parseValues(value)) {
455       String namespacePrefix = StringUtils.substringBefore(item, ":");
456       collectNamespacePrefix(prefixes, namespacePrefix);
457     }
458   }
459 
460   /**
461    * Sort weak reference values alphabetically to ensure consistent ordering.
462    * @param value Property value
463    * @param propertyType Property type from {@link PropertyType}
464    * @return Property value with sorted references
465    */
466   private String sortReferenceValues(String value, int propertyType) {
467     Set<String> refs = new TreeSet<>();
468     for (String item : DocViewUtil.parseValues(value)) {
469       refs.add(item);
470     }
471     return DocViewUtil.formatValues(new ArrayList<>(refs), propertyType);
472   }
473 
474 }