Merging XML files using XPath

This example is a solution to a commonplace XML problem – merging two XML files into one. Our two input XMLs do not have any indexes mapping one node to another; instead, we assume that there is a one-to-one correspondence between the nodes of each XML based on their position in the XML file.
Instead of using user-friendly XML libraries such as JDOM or Xerces-J, we use a lightweight, less resource-intensive technology – XPath, which comes bundled with the Java SDK.

Here is the first input XML:

XmlOne.xml

<?xml version="1.0" encoding="UTF-8"?>
<Mattiz>
    <Content>
        <Name>Jo</Name>
        <RollNumber>3</RollNumber>
    </Content>
    <Content>
        <Name>Jack</Name>
        <RollNumber>7</RollNumber>
    </Content>
    <Content>
        <Name>Harrison</Name>
        <RollNumber>14</RollNumber>
    </Content>
    <Content>
        <Name>Mike</Name>
        <RollNumber>26</RollNumber>
    </Content>
    <Content>
        <Name>Mick</Name>
        <RollNumber>98</RollNumber>
    </Content>
    <Content>
        <Name>Jake</Name>
        <RollNumber>101</RollNumber>
    </Content>
    <Content>
        <Name>Tintin</Name>
        <RollNumber>238</RollNumber>
    </Content>
    <Content>
        <Name>Goldie</Name>
        <RollNumber>500</RollNumber>
    </Content>
    <Content>
        <Name>Sommer</Name>
        <RollNumber>501</RollNumber>
    </Content>
    <Content>
        <Name>Hayley</Name>
        <RollNumber>567</RollNumber>
    </Content>
</Mattiz>

The second input XML – XmlTwo.xml

<?xml version="1.0" encoding="UTF-8"?>
<Mattiz>
    <Content>
        <Name>Jo</Name>
        <Age>7</Age>
    </Content>
    <Content>
        <Name>Jack</Name>
        <Age>10</Age>
    </Content>
    <Content>
        <Name>Harrison</Name>
        <Age>11</Age>
    </Content>
    <Content>
        <Name>Mike</Name>
        <Age>15</Age>
    </Content>
    <Content>
        <Name>Mick</Name>
        <Age>16</Age>
    </Content>
    <Content>
        <Name>Jake</Name>
        <Age>16</Age>
    </Content>
    <Content>
        <Name>Tintin</Name>
        <Age>18</Age>
    </Content>
    <Content>
        <Name>Goldie</Name>
        <Age>21</Age>
    </Content>
    <Content>
        <Name>Sommer</Name>
        <Age>71</Age>
    </Content>
    <Content>
        <Name>Hayley</Name>
        <Age>100</Age>
    </Content>
</Mattiz>

And here is the Main class that does the merging:
MergeXml.java

package com.mattiz.merge.xml;

import java.io.IOException;
import java.io.StringWriter;
import java.io.Writer;

import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.parsers.ParserConfigurationException;
import javax.xml.transform.Result;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerConfigurationException;
import javax.xml.transform.TransformerException;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import javax.xml.xpath.XPath;
import javax.xml.xpath.XPathConstants;
import javax.xml.xpath.XPathExpression;
import javax.xml.xpath.XPathExpressionException;
import javax.xml.xpath.XPathFactory;

import org.w3c.dom.Document;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;
import org.xml.sax.SAXException;

/**
 * Merges two XML documents positionally: the n-th node selected by an XPath
 * expression in the second document is merged into the n-th node selected by
 * the same expression in the first document.
 *
 * <p>
 * The documents carry no keys linking their nodes, so the pairing relies
 * purely on document order. After the children have been copied across, the
 * second {@code Name} child of every merged node is dropped, because both
 * inputs are expected to carry the same {@code Name} element.
 */
public class MergeXml {

	/** First input document, used when no paths are given on the command line. */
	private static final String DEFAULT_FIRST_XML = "./resources/XmlOne.xml";

	/** Second input document, used when no paths are given on the command line. */
	private static final String DEFAULT_SECOND_XML = "./resources/XmlTwo.xml";

	/** XPath selecting the nodes that are paired up between the two documents. */
	private static final String CONTENT_XPATH = "/Mattiz/Content";

	/**
	 * Parses the two input files, merges them and prints the merged XML to
	 * standard output.
	 *
	 * @param args
	 *            optional; when at least two arguments are present they are
	 *            used as the locations of the first and second XML file,
	 *            otherwise the bundled sample files are used
	 * @throws ParserConfigurationException
	 *             if no DOM parser can be created
	 * @throws SAXException
	 *             if either input is not well-formed XML
	 * @throws IOException
	 *             if either input cannot be read
	 * @throws XPathExpressionException
	 *             if the XPath expression cannot be compiled or evaluated
	 * @throws TransformerException
	 *             if the merged document cannot be serialised
	 */
	public static void main(String[] args) throws ParserConfigurationException,
			SAXException, IOException, XPathExpressionException,
			TransformerException {
		boolean pathsGiven = args != null && args.length >= 2;
		String firstXml = pathsGiven ? args[0] : DEFAULT_FIRST_XML;
		String secondXml = pathsGiven ? args[1] : DEFAULT_SECOND_XML;

		XPathFactory xPathFactory = XPathFactory.newInstance();
		XPath xpath = xPathFactory.newXPath();
		DocumentBuilderFactory domFactory = DocumentBuilderFactory
				.newInstance();
		domFactory.setNamespaceAware(true);
		DocumentBuilder builder = domFactory.newDocumentBuilder();
		Document doc1 = builder.parse(firstXml);
		Document doc2 = builder.parse(secondXml);
		Document mergedDoc = passThroughXML(xpath, CONTENT_XPATH, doc1, doc2);
		System.out.println(getDocumentAsStringAfterTransformation(mergedDoc));
	}

	/**
	 * Merges {@code doc2} into {@code doc1} by pairing, in document order, the
	 * nodes that {@code xPathString} selects in each document.
	 *
	 * @param doc1
	 *            document that receives the merged content; it is modified in
	 *            place
	 * @param doc2
	 *            document whose content is copied; it is not modified
	 * @param xPathString
	 *            XPath expression selecting the nodes to pair up
	 * @return {@code doc1}, after merging
	 * @throws XPathExpressionException
	 *             if {@code xPathString} cannot be compiled or evaluated
	 * @throws IllegalArgumentException
	 *             if an argument is {@code null}, or if {@code doc1} has fewer
	 *             matching nodes than {@code doc2}
	 */
	public static Document merge(Document doc1, Document doc2,
			String xPathString) throws XPathExpressionException {
		if (doc1 == null || doc2 == null || xPathString == null) {
			throw new IllegalArgumentException(
					"doc1, doc2 and xPathString must not be null");
		}
		XPath xpath = XPathFactory.newInstance().newXPath();
		return passThroughXML(xpath, xPathString, doc1, doc2);
	}

	/**
	 * Selects the matching nodes of both documents once, merges them pairwise
	 * and then removes the duplicated {@code Name} elements.
	 *
	 * @return {@code doc1}; never {@code null}, even when {@code doc2} has no
	 *         matching nodes
	 */
	private static Document passThroughXML(XPath xpath, String xPathString,
			Document doc1, Document doc2) throws XPathExpressionException {
		XPathExpression xPathExpression = xpath.compile(xPathString);
		NodeList targetNodes = (NodeList) xPathExpression.evaluate(doc1,
				XPathConstants.NODESET);
		NodeList sourceNodes = (NodeList) xPathExpression.evaluate(doc2,
				XPathConstants.NODESET);
		// Validate before touching doc1 so that a mismatch never leaves it
		// half merged.
		if (targetNodes.getLength() < sourceNodes.getLength()) {
			throw new IllegalArgumentException("Second document has "
					+ sourceNodes.getLength() + " node(s) matching '"
					+ xPathString + "' but first document has only "
					+ targetNodes.getLength()
					+ "; nodes cannot be paired by position");
		}
		// Pair by index in the evaluated node-sets. A "[position()=k]"
		// predicate would be evaluated per parent and therefore mis-pair
		// nodes whenever the path matches under more than one parent.
		for (int i = 0; i < sourceNodes.getLength(); i++) {
			mergeXml(doc1, targetNodes.item(i), sourceNodes.item(i));
		}
		return removeDuplicates(xPathString, xpath, doc1);
	}

	/**
	 * Appends deep copies of all children of {@code sourceNode} to
	 * {@code targetNode}.
	 *
	 * @param targetDoc
	 *            owner document of {@code targetNode}
	 * @param targetNode
	 *            node that receives the copies
	 * @param sourceNode
	 *            node whose children are copied; left unchanged
	 */
	private static void mergeXml(Document targetDoc, Node targetNode,
			Node sourceNode) {
		// Remember the last original child so the loop also terminates when
		// source and target are the same node (copies are appended behind it).
		Node lastOriginalChild = sourceNode.getLastChild();
		Node child = sourceNode.getFirstChild();
		while (child != null) {
			Node next = (child == lastOriginalChild) ? null : child
					.getNextSibling();
			targetNode.appendChild(targetDoc.importNode(child, true));
			child = next;
		}
	}

	/**
	 * Removes the second {@code Name} child of every node selected by
	 * {@code xPathString}.
	 *
	 * <p>
	 * This assumes each input contributed exactly one {@code Name} per node,
	 * so the second one is the copy that came from the second document.
	 *
	 * @return the same {@code document} instance
	 */
	private static Document removeDuplicates(String xPathString, XPath xpath,
			Document document) throws XPathExpressionException {
		XPathExpression xPathExpression = xpath.compile(xPathString
				.concat("/Name[position()=2]"));
		// The XPath result is a snapshot, so removing nodes while iterating
		// over it is safe.
		NodeList nodeList = (NodeList) xPathExpression.evaluate(document,
				XPathConstants.NODESET);
		for (int i = 0; i < nodeList.getLength(); i++) {
			Node nodeToRemove = nodeList.item(i);
			nodeToRemove.getParentNode().removeChild(nodeToRemove);
		}
		return document;
	}

	/**
	 * Serialises a DOM document to a string using an identity transformation.
	 *
	 * @param document
	 *            document to serialise
	 * @return the XML text of {@code document}
	 * @throws TransformerConfigurationException
	 *             if no transformer can be created
	 * @throws TransformerException
	 *             if serialisation fails
	 */
	private static String getDocumentAsStringAfterTransformation(Document document)
			throws TransformerConfigurationException, TransformerException {
		TransformerFactory transformerFactory = TransformerFactory
				.newInstance();
		Transformer transformer = transformerFactory.newTransformer();
		DOMSource source = new DOMSource(document);
		Writer outWriter = new StringWriter();
		Result result = new StreamResult(outWriter);
		transformer.transform(source, result);
		return outWriter.toString();
	}
}

The output XML that is generated looks something like this on the console:

<?xml version="1.0" encoding="UTF-8" standalone="no"?>
  <Mattiz>
    <Content>
        <Name>Jo</Name>
        <RollNumber>3</RollNumber>        
        <Age>7</Age>
    </Content>
    <Content>
        <Name>Jack</Name>
        <RollNumber>7</RollNumber>        
        <Age>10</Age>
    </Content>
    <Content>
        <Name>Harrison</Name>
        <RollNumber>14</RollNumber>        
        <Age>11</Age>
    </Content>
    <Content>
        <Name>Mike</Name>
        <RollNumber>26</RollNumber>        
        <Age>15</Age>
    </Content>
    <Content>
        <Name>Mick</Name>
        <RollNumber>98</RollNumber>        
        <Age>16</Age>
    </Content>
    <Content>
        <Name>Jake</Name>
        <RollNumber>101</RollNumber>        
        <Age>16</Age>
    </Content>
    <Content>
        <Name>Tintin</Name>
        <RollNumber>238</RollNumber>        
        <Age>18</Age>
    </Content>
    <Content>
        <Name>Goldie</Name>
        <RollNumber>500</RollNumber>        
        <Age>21</Age>
    </Content>
    <Content>
        <Name>Sommer</Name>
        <RollNumber>501</RollNumber>
        <Age>71</Age>
    </Content>
    <Content>
        <Name>Hayley</Name>
        <RollNumber>567</RollNumber>       
        <Age>100</Age>
    </Content>
</Mattiz>

[purchase_link id=”0″ style=”” color=”” text=”Purchase”]file_struct

The source code can be downloaded here

About cuppajavamattiz
Matty Jacob - Avid technical blogger with interests in J2EE, Web Application Servers, Web frameworks, Open source libraries, Relational Databases, Web Services, Source control repositories, ETL, IDE Tools and related technologies.

Comments are closed.

%d bloggers like this: