Coverage Report

 /*
  * HtmlCollector.java -- structures an HTML document tree.  
  * Copyright (C) 1999 Quiotix Corporation.  
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License, version 2, as 
  * published by the Free Software Foundation.  
  *
  * This program is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License (http://www.gnu.org/copyleft/gpl.txt)
  * for more details.
  */
 
 package com.quiotix.html.parser;
 
 import java.io.FileInputStream;
 import java.io.InputStream;
 import java.util.HashSet;
 import java.util.Iterator;
 import java.util.Locale;
 import java.util.Set;
 import java.util.Vector;
 
 /**
  * An HtmlVisitor which modifies the structure of the document so that
  * begin tags are matched properly with end tags and placed in TagBlock
  * elements.  Typically, an HtmlDocument is created by the parser, which
  * simply returns a flat list of elements.  The HtmlCollector takes this
  * flat list and gives it the structure that is implied by the HTML content.
  *
  * @author Brian Goetz, Quiotix
  */
 
 public class HtmlCollector extends HtmlVisitor {
 
     protected ElementStack tagStack = new ElementStack();
     protected ElementStack elements;
     protected boolean collected;
     protected static Set dontMatch = new HashSet();
     protected static String[] dontMatchStrings
     = {"AREA", "BASE", "BASEFONT", "BR", "COL", "HR", "IMG", "INPUT",
        "ISINDEX", "LINK", "META", "PARAM"};
 
     static {
         for (int i = 0; i < dontMatchStrings.length; i++)
             dontMatch.add(dontMatchStrings[i]);
     }
 
     private static class TagStackEntry {
         String tagName;
         int index;
     }
 
     private static class ElementStack extends Vector {
       private static final long serialVersionUID = 3718394150667677113L;
 
         ElementStack() {
             super();
         }
 
         ElementStack(int n) {
             super(n);
         }
 
         void popN(int n) {
             elementCount -= n;
         }
     }
 
     protected int pushNode(HtmlDocument.HtmlElement e) {
         elements.addElement(e);
         return elements.size() - 1;
     }
 
     public void visit(HtmlDocument.Comment c) {
         pushNode(c);
     }
 
     public void visit(HtmlDocument.Text t) {
         pushNode(t);
     }
 
     public void visit(HtmlDocument.Newline n) {
         pushNode(n);
     }
 
     public void visit(HtmlDocument.Tag t) {
         TagStackEntry ts = new TagStackEntry();
         int index;
 
         // Push the tag onto the element stack, and push an entry on the tag
         // stack if it's a tag we care about matching
         index = pushNode(t);
         if (!t.emptyTag
                 && !dontMatch.contains(t.tagName.toUpperCase())) {
             ts.tagName = t.tagName;
             ts.index = index;
             tagStack.addElement(ts);
         }
     }
 
     public void visit(HtmlDocument.EndTag t) {
         int i;
         for (i = tagStack.size() - 1; i >= 0; i--) {
             TagStackEntry ts = (TagStackEntry) tagStack.elementAt(i);
             if (t.tagName.equalsIgnoreCase(ts.tagName)) {
                 HtmlDocument.TagBlock block;
                 HtmlDocument.ElementSequence blockElements;
                 HtmlDocument.Tag tag;
 
                 // Create a new ElementSequence and copy the elements to it
                 blockElements =
                         new HtmlDocument.ElementSequence(elements.size() - ts.index - 1);
                 for (int j = ts.index + 1; j < elements.size(); j++)
                     blockElements.addElement((HtmlDocument.HtmlElement)
                             elements.elementAt(j));
                 tag = (HtmlDocument.Tag) elements.elementAt(ts.index);
                 block = new HtmlDocument.TagBlock(tag.tagName,
                         tag.attributeList, blockElements);
 
                 // Pop the elements off the stack, push the new block
                 elements.popN(elements.size() - ts.index);
                 elements.addElement(block);
 
                 // Pop the matched tag and intervening unmatched tags
                 tagStack.popN(tagStack.size() - i);
 
                 collected = true;
                 break;
             }
         }
 
         // If we didn't find a match, just push the end tag
         if (i < 0)
             pushNode(t);
     }
 
     public void visit(HtmlDocument.TagBlock bl) {
         HtmlCollector c = new HtmlCollector();
 
         c.start();
         c.visit(bl.body);
         c.finish();
         pushNode(bl);
     }
 
     public void visit(HtmlDocument.ElementSequence s) {
         elements = new ElementStack(s.size());
         collected = false;
 
         for (Iterator iterator = s.iterator(); iterator.hasNext();) {
             HtmlDocument.HtmlElement htmlElement = (HtmlDocument.HtmlElement) iterator.next();
             htmlElement.accept(this);
         }
         if (collected)
             s.setElements(elements);
     }
 
     /**
      * Runnable.
      */
     public static void main(String[] args) throws Exception {
         InputStream r = new FileInputStream(args[0]);
 
         try {
             HtmlDocument document = new HtmlParser(r).HtmlDocument();
             document.accept(new HtmlScrubber());
             document.accept(new HtmlCollector());
             document.accept(new HtmlDumper(System.out));
         } finally {
             r.close();
         }
     }
 }
 

1		/*
2		* HtmlCollector.java -- structures an HTML document tree.
3		* Copyright (C) 1999 Quiotix Corporation.
4		*
5		* This program is free software; you can redistribute it and/or modify
6		* it under the terms of the GNU General Public License, version 2, as
7		* published by the Free Software Foundation.
8		*
9		* This program is distributed in the hope that it will be useful,
10		* but WITHOUT ANY WARRANTY; without even the implied warranty of
11		* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12		* GNU General Public License (http://www.gnu.org/copyleft/gpl.txt)
13		* for more details.
14		*/
15
16		package com.quiotix.html.parser;
17
18		import java.io.FileInputStream;
19		import java.io.InputStream;
20		import java.util.HashSet;
21		import java.util.Iterator;
22		import java.util.Set;
23		import java.util.Vector;
24
25		/**
26		* An HtmlVisitor which modifies the structure of the document so that
27		* begin tags are matched properly with end tags and placed in TagBlock
28		* elements. Typically, an HtmlDocument is created by the parser, which
29		* simply returns a flat list of elements. The HtmlCollector takes this
30		* flat list and gives it the structure that is implied by the HTML content.
31		*
32		* @author Brian Goetz, Quiotix
33		*/
34
35	4	public class HtmlCollector extends HtmlVisitor {
36
37	4	protected ElementStack tagStack = new ElementStack();
38		protected ElementStack elements;
39		protected boolean collected;
40	2	protected static Set dontMatch = new HashSet();
41	2	protected static String[] dontMatchStrings
42		= {"AREA", "BASE", "BASEFONT", "BR", "COL", "HR", "IMG", "INPUT",
43		"ISINDEX", "LINK", "META", "PARAM"};
44
45		static {
46	26	for (int i = 0; i < dontMatchStrings.length; i++)
47	24	dontMatch.add(dontMatchStrings[i]);
48	2	}
49
50	40	private static class TagStackEntry {
51		String tagName;
52		int index;
53		}
54
55	4	private static class ElementStack extends Vector {
56		private static final long serialVersionUID = 3718394150667677113L;
57
58		ElementStack() {
59	4	super();
60	4	}
61
62		ElementStack(int n) {
63	4	super(n);
64	4	}
65
66		void popN(int n) {
67	0	elementCount -= n;
68	0	}
69		}
70
71		protected int pushNode(HtmlDocument.HtmlElement e) {
72	32	elements.addElement(e);
73	32	return elements.size() - 1;
74		}
75
76		public void visit(HtmlDocument.Comment c) {
77	0	pushNode(c);
78	0	}
79
80		public void visit(HtmlDocument.Text t) {
81	8	pushNode(t);
82	8	}
83
84		public void visit(HtmlDocument.Newline n) {
85	4	pushNode(n);
86	4	}
87
88		public void visit(HtmlDocument.Tag t) {
89	20	TagStackEntry ts = new TagStackEntry();
90		int index;
91
92		// Push the tag onto the element stack, and push an entry on the tag
93		// stack if it's a tag we care about matching
94	20	index = pushNode(t);
95	20	if (!t.emptyTag
96		&& !dontMatch.contains(t.tagName.toUpperCase())) {
97	20	ts.tagName = t.tagName;
98	20	ts.index = index;
99	20	tagStack.addElement(ts);
100		}
101	20	}
102
103		public void visit(HtmlDocument.EndTag t) {
104		int i;
105	0	for (i = tagStack.size() - 1; i >= 0; i--) {
106	0	TagStackEntry ts = (TagStackEntry) tagStack.elementAt(i);
107	0	if (t.tagName.equalsIgnoreCase(ts.tagName)) {
108		HtmlDocument.TagBlock block;
109		HtmlDocument.ElementSequence blockElements;
110		HtmlDocument.Tag tag;
111
112		// Create a new ElementSequence and copy the elements to it
113	0	blockElements =
114		new HtmlDocument.ElementSequence(elements.size() - ts.index - 1);
115	0	for (int j = ts.index + 1; j < elements.size(); j++)
116	0	blockElements.addElement((HtmlDocument.HtmlElement)
117		elements.elementAt(j));
118	0	tag = (HtmlDocument.Tag) elements.elementAt(ts.index);
119	0	block = new HtmlDocument.TagBlock(tag.tagName,
120		tag.attributeList, blockElements);
121
122		// Pop the elements off the stack, push the new block
123	0	elements.popN(elements.size() - ts.index);
124	0	elements.addElement(block);
125
126		// Pop the matched tag and intervening unmatched tags
127	0	tagStack.popN(tagStack.size() - i);
128
129	0	collected = true;
130	0	break;
131		}
132		}
133
134		// If we didn't find a match, just push the end tag
135	0	if (i < 0)
136	0	pushNode(t);
137	0	}
138
139		public void visit(HtmlDocument.TagBlock bl) {
140	0	HtmlCollector c = new HtmlCollector();
141
142	0	c.start();
143	0	c.visit(bl.body);
144	0	c.finish();
145	0	pushNode(bl);
146	0	}
147
148		public void visit(HtmlDocument.ElementSequence s) {
149	4	elements = new ElementStack(s.size());
150	4	collected = false;
151
152	4	for (Iterator iterator = s.iterator(); iterator.hasNext();) {
153	32	HtmlDocument.HtmlElement htmlElement = (HtmlDocument.HtmlElement) iterator.next();
154	32	htmlElement.accept(this);
155	32	}
156	4	if (collected)
157	0	s.setElements(elements);
158	4	}
159
160		/**
161		* Runnable.
162		*/
163		public static void main(String[] args) throws Exception {
164	0	InputStream r = new FileInputStream(args[0]);
165
166		try {
167	0	HtmlDocument document = new HtmlParser(r).HtmlDocument();
168	0	document.accept(new HtmlScrubber());
169	0	document.accept(new HtmlCollector());
170	0	document.accept(new HtmlDumper(System.out));
171		} finally {
172	0	r.close();
173	0	}
174	0	}
175		}
176