extract.js

Summary

Tools for extracting XML from HTML using information in a special "xml" attribute. See the comment for the Instruction object for details on instruction format.

Version: 0.8 $Id: overview-summary-extract.js.html,v 1.1 2008/02/20 18:47:09 jameso Exp $

Author: James A. Overton


Class Summary
mozile.save.extract.Instruction  

/* ***** BEGIN LICENSE BLOCK *****
 * Licensed under Version: MPL 1.1/GPL 2.0/LGPL 2.1
 * Full Terms at http://mozile.mozdev.org/0.8/LICENSE
 *
 * Software distributed under the License is distributed on an "AS IS" basis,
 * WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
 * for the specific language governing rights and limitations under the
 * License.
 *
 * The Original Code is James A. Overton's code (james@overton.ca).
 *
 * The Initial Developer of the Original Code is James A. Overton.
 * Portions created by the Initial Developer are Copyright (C) 2005-2006
 * the Initial Developer. All Rights Reserved.
 *
 * Contributor(s):
 *	James A. Overton <james@overton.ca>
 *
 * ***** END LICENSE BLOCK ***** */

/**
 * @fileoverview Tools for extracting XML from HTML using information in a special "xml" attribute. See the comment for the Instruction object for details on instruction format.
 * @link http://mozile.mozdev.org 
 * @author James A. Overton <james@overton.ca>
 * @version 0.8
 * $Id: overview-summary-extract.js.html,v 1.1 2008/02/20 18:47:09 jameso Exp $
 */

mozile.require("mozile.dom");
mozile.require("mozile.save");
mozile.provide("mozile.save.extract.*");

/**
 * Namespace object that holds the tools for extracting XML from HTML.
 * @type Object
 */
mozile.save.extract = {};
// JSDoc hack: the prototype property exists so the namespace is reported as a mozile.Module.
mozile.save.extract.prototype = new mozile.Module();


/**
 * Walks an HTML subtree and builds XML from it. Every element that carries an "xml" attribute has that attribute parsed into Instructions, which are then executed against the element.
 * Text nodes are not copied by this function; text is handled by the Instructions themselves.
 * @param {Node} node The node at which extraction starts.
 * @param {Element} container The XML element that receives the extracted content.
 * @type Element
 * @return The container that was passed in, or null when no node was given.
 */
mozile.save.extract.extract = function(node, container) {
	mozile.require("mozile.dom");
	if(!node) return null;

	// Only elements can carry instructions or have children worth visiting.
	if(node.nodeType != mozile.dom.ELEMENT_NODE) return container;

	// Children are extracted into the given container unless the first instruction supplies another one.
	var childContainer = container;

	var xml = node.getAttribute("xml");
	if(xml) {
		var instructions = mozile.save.extract.parseInstructions(xml);
		var first = instructions[0];
		// The first instruction decides where the remaining instructions and the children go.
		childContainer = first.execute(node, container);
		for(var n = 1; n < instructions.length; n++) {
			instructions[n].execute(node, childContainer);
		}
		// When the first instruction is a "Set Attribute", the children have already been handled.
		if(first.getType() == "Set Attribute") return container;
	}

	// Recurse into the child nodes.
	for(var k = 0; k < node.childNodes.length; k++) {
		mozile.save.extract.extract(node.childNodes[k], childContainer);
	}

	return container;
}


/**
 * Takes the text content of an "xml" attribute and parses it into Instruction objects.
 * <p>Instructions are separated by white space. Each instruction is a target, optionally followed by "=" and either a selector or a single-quoted string value. Inside a quoted value, white space and "=" are ordinary characters. A backslash keeps the following character from being interpreted; both the backslash and that character are copied into the result.
 * <p>Empty instructions produced by leading, trailing or repeated white space are dropped. The returned array always holds at least one Instruction (an empty one for blank input).
 * @param {String} string Text of Instructions.
 * @type Array
 * @return An array of the new Instructions, in the order they appear in the text.
 */
mozile.save.extract.parseInstructions = function(string) {
	var instructions = new Array();
	var instruction = new mozile.save.extract.Instruction;
	var mode = "target";
	var c;

	// Keep the current instruction unless nothing at all was parsed into it.
	var store = function() {
		if(instruction.target || instruction.select || instruction.value) {
			instructions.push(instruction);
		}
	};

	for(var i=0; i < string.length; i++) {
		c = string.charAt(i);
		
		// Handle assignment operators. An "=" inside a quoted value is plain text.
		if(c == "=" && mode != "value") {
			i++;
			c = string.charAt(i);
			if(c == "'") {
				mode = "value";
				continue;
			}
			else mode = "select";
		}
		// Close quotes string.
		if(c == "'" && mode == "value") {
			mode = "target";
			continue;
		}
		// Handle white space separators.
		else if(c.match(/\s/) && mode != "value") {
			store();
			instruction = new mozile.save.extract.Instruction;
			mode = "target";
			continue;
		}
		
		// Append a character. Skip over escaped characters.
		instruction[mode] += c;
		if(c == "\\" && string.charAt(i+1)) {
			i++;
			instruction[mode] += string.charAt(i);
		}
	}
	store();

	// Callers index the first instruction unconditionally, so never return an empty array.
	if(instructions.length == 0) instructions.push(instruction);

	return instructions;
}

/**
 * An Instruction converts a piece of HTML into XML. Its target names an element or attribute in the XML output. Its source is either a selector that picks parts of the HTML input or a literal string value. Executing the instruction assigns the source to the target.
 * <p>Examples of instructions:
 * <ul>
 *   <li>Create an element: "newElement"
 *   <li>Assign all children to an element: "newElement=*"
 *   <li>Assign all text to an element: "newElement=text()"
 *   <li>Assign a string to an element: "newElement='string content'"
 *   <li>Assign a string to an attribute: "@newAttribute='string content'"
 *   <li>Assign an attribute to an attribute: "@newAttribute=@oldAttribute"
 * </ul>
 * <p>All three fields start out as empty strings.
 * @constructor
 */
mozile.save.extract.Instruction = function() {
	/**
	 * Name of the XML element or attribute ("@name") that receives the result.
	 * @type String
	 */
	this.target = "";

	/**
	 * Selector for the part of the HTML input to use.
	 * @type String
	 */
	this.select = "";

	/**
	 * Literal string assigned to the target.
	 * @type String
	 */
	this.value = "";
}

/**
 * Renders this Instruction for debugging as the label "Instruction" followed by its target, select and value, separated by " :: ".
 * @type String
 */
mozile.save.extract.Instruction.prototype.toString = function() {
	var fields = ["Instruction"];
	fields.push(this.target, this.select, this.value);
	return fields.join(" :: ");
}


/**
 * Classifies this instruction. Without a target there is no type (null). Otherwise a literal value takes precedence over a selector, and an instruction with neither is named after its target: "Set Attribute" for "@name" targets, "Create Element" for everything else.
 * TODO: Define types more clearly.
 * @type String
 */
mozile.save.extract.Instruction.prototype.getType = function() {
	var name = this.target;
	if(!name) return null;

	if(this.value) return "Assign Value";
	if(this.select) return "Map Selection";

	return name.charAt(0) == "@" ? "Set Attribute" : "Create Element";
}


/**
 * Does the work of manipulating the target. Takes an HTML element as its context and an XML container to attach new content to.
 * <p>The target may be prefixed with a path ("../name"); every ".." step moves the attachment point to the parent of the current XML node. If the path climbs above the top of the XML tree, nothing is created, a debug message is logged and the given container is returned unchanged. Named path segments are accepted but are not matched against the XML tree.
 * <p>Any exception raised while executing is reported with an alert and the container is returned.
 * TODO: Clean up and generalize. Use proper XPaths.
 * @param {Element} element An HTML element to use as input.
 * @param {Element} container An XML element to attach output to.
 * @type Element
 * @return Either the given container or a newly created container is returned.
 */
mozile.save.extract.Instruction.prototype.execute = function(element, container) {
try {
	var useContainer = container;
	var target = this.target;
	var attribute, text, i;
	
	// The value is the literal string, or for an "@name" selector the HTML attribute of that name,
	// falling back to the element property of the same name when the attribute is empty.
	var value = this.value;
	if(this.select && this.select.charAt(0) == "@") {
		attribute = this.select.substring(1);
		value = element.getAttribute(attribute);
		if(!value && element[attribute] != undefined) 
			value = element[attribute];
	}

	// Follow a given path up the DOM tree.
	if(this.target.indexOf("/") != -1) {
		var ancestors = this.target.split("/");
		target = ancestors.pop();
		for(i = ancestors.length - 1; i >= 0; i--) {
			if(ancestors[i] == ".." && useContainer) {
				useContainer = useContainer.parentNode;
			}
			// Check after every step, so that climbing past the root is caught here
			// rather than failing later on a null attachment point.
			if(!useContainer) {
				mozile.debug.debug("mozile.save.extract.Instruction.prototype.execute", "No node matching target: "+ this.target);
				return container;
			}
		}
	}

	// Assign element contents to attribute.
	if(target.charAt(0) == "@") {
		if(!this.value && 
			(!this.select || this.select == "*" || this.select == "text()") ) {
			value = mozile.dom.getText(element);
		}
	}
	// Create a new element. It takes the namespace of the given container and becomes the return value.
	else if(target != "..") {
		var newContainer = mozile.dom.createElementNS(container.namespaceURI, target);
		useContainer.appendChild(newContainer);
		useContainer = newContainer;
		container = newContainer;
	}
	
	// Assign a value.
	if(value) {
		// Assign to an attribute.
		if(target.charAt(0) == "@") {
			useContainer.setAttribute(target.substring(1), value);
		}
		// Assign to an element.
		else {
			text = useContainer.ownerDocument.createTextNode(value);
			useContainer.appendChild(text);
		}
	}
	
	if(target.charAt(0) != "@") {
		// Select all children: deep copies of every child node are appended.
		if(this.select == "*") {
			for(i=0; i < element.childNodes.length; i++) {
				useContainer.appendChild(element.childNodes[i].cloneNode(true));
			}
		}
		// Select all text children, skipping white space only text nodes.
		if(this.select == "text()") {
			for(i=0; i < element.childNodes.length; i++) {
				if(element.childNodes[i].nodeType != mozile.dom.TEXT_NODE) continue;
				if(mozile.dom.isWhitespace(element.childNodes[i])) continue;
				useContainer.appendChild(element.childNodes[i].cloneNode(true));
			}
		}
	}

} catch(e) {
	alert(mozile.dumpError(e) +"\n"+ element +" "+ container +"\n"+ element.getAttribute("xml") +"\n"+ this.toString() +"\n"+ target +" "+ value);
}


	return container;
}


Documentation generated by JSDoc on Wed Feb 20 13:25:28 2008