View Javadoc

1   /*
2    * HtmlCollector.java -- structures an HTML document tree.  
3    * Copyright (C) 1999 Quiotix Corporation.  
4    *
5    * This program is free software; you can redistribute it and/or modify
6    * it under the terms of the GNU General Public License, version 2, as 
7    * published by the Free Software Foundation.  
8    *
9    * This program is distributed in the hope that it will be useful,
10   * but WITHOUT ANY WARRANTY; without even the implied warranty of
11   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12   * GNU General Public License (http://www.gnu.org/copyleft/gpl.txt)
13   * for more details.
14   */
15  
16  package com.quiotix.html.parser;
17  
18  import java.io.FileInputStream;
19  import java.io.InputStream;
20  import java.util.HashSet;
21  import java.util.Iterator;
22  import java.util.Set;
23  import java.util.Vector;
24  
25  /**
26   * An HtmlVisitor which modifies the structure of the document so that
27   * begin tags are matched properly with end tags and placed in TagBlock
28   * elements.  Typically, an HtmlDocument is created by the parser, which
29   * simply returns a flat list of elements.  The HtmlCollector takes this
30   * flat list and gives it the structure that is implied by the HTML content.
31   *
32   * @author Brian Goetz, Quiotix
33   */
34  
35  public class HtmlCollector extends HtmlVisitor {
36  
37      protected ElementStack tagStack = new ElementStack();
38      protected ElementStack elements;
39      protected boolean collected;
40      protected static Set dontMatch = new HashSet();
41      protected static String[] dontMatchStrings
42      = {"AREA", "BASE", "BASEFONT", "BR", "COL", "HR", "IMG", "INPUT",
43         "ISINDEX", "LINK", "META", "PARAM"};
44  
45      static {
46          for (int i = 0; i < dontMatchStrings.length; i++)
47              dontMatch.add(dontMatchStrings[i]);
48      }
49  
50      private static class TagStackEntry {
51          String tagName;
52          int index;
53      }
54  
55      private static class ElementStack extends Vector {
56  
57          ElementStack() {
58              super();
59          }
60  
61          ElementStack(int n) {
62              super(n);
63          }
64  
65          void popN(int n) {
66              elementCount -= n;
67          }
68      }
69  
70      protected int pushNode(HtmlDocument.HtmlElement e) {
71          elements.addElement(e);
72          return elements.size() - 1;
73      }
74  
75      public void visit(HtmlDocument.Comment c) {
76          pushNode(c);
77      }
78  
79      public void visit(HtmlDocument.Text t) {
80          pushNode(t);
81      }
82  
83      public void visit(HtmlDocument.Newline n) {
84          pushNode(n);
85      }
86  
87      public void visit(HtmlDocument.Tag t) {
88          TagStackEntry ts = new TagStackEntry();
89          int index;
90  
91          // Push the tag onto the element stack, and push an entry on the tag
92          // stack if it's a tag we care about matching
93          index = pushNode(t);
94          if (!t.emptyTag
95                  && !dontMatch.contains(t.tagName.toUpperCase())) {
96              ts.tagName = t.tagName;
97              ts.index = index;
98              tagStack.addElement(ts);
99          }
100     }
101 
102     public void visit(HtmlDocument.EndTag t) {
103         int i;
104         for (i = tagStack.size() - 1; i >= 0; i--) {
105             TagStackEntry ts = (TagStackEntry) tagStack.elementAt(i);
106             if (t.tagName.equalsIgnoreCase(ts.tagName)) {
107                 HtmlDocument.TagBlock block;
108                 HtmlDocument.ElementSequence blockElements;
109                 HtmlDocument.Tag tag;
110 
111                 // Create a new ElementSequence and copy the elements to it
112                 blockElements =
113                         new HtmlDocument.ElementSequence(elements.size() - ts.index - 1);
114                 for (int j = ts.index + 1; j < elements.size(); j++)
115                     blockElements.addElement((HtmlDocument.HtmlElement)
116                             elements.elementAt(j));
117                 tag = (HtmlDocument.Tag) elements.elementAt(ts.index);
118                 block = new HtmlDocument.TagBlock(tag.tagName,
119                         tag.attributeList, blockElements);
120 
121                 // Pop the elements off the stack, push the new block
122                 elements.popN(elements.size() - ts.index);
123                 elements.addElement(block);
124 
125                 // Pop the matched tag and intervening unmatched tags
126                 tagStack.popN(tagStack.size() - i);
127 
128                 collected = true;
129                 break;
130             }
131         }
132 
133         // If we didn't find a match, just push the end tag
134         if (i < 0)
135             pushNode(t);
136     }
137 
138     public void visit(HtmlDocument.TagBlock bl) {
139         HtmlCollector c = new HtmlCollector();
140 
141         c.start();
142         c.visit(bl.body);
143         c.finish();
144         pushNode(bl);
145     }
146 
147     public void visit(HtmlDocument.ElementSequence s) {
148         elements = new ElementStack(s.size());
149         collected = false;
150 
151         for (Iterator iterator = s.iterator(); iterator.hasNext();) {
152             HtmlDocument.HtmlElement htmlElement = (HtmlDocument.HtmlElement) iterator.next();
153             htmlElement.accept(this);
154         }
155         if (collected)
156             s.setElements(elements);
157     }
158 
159     /**
160      * Runnable.
161      */
162     public static void main(String[] args) throws Exception {
163         InputStream r = new FileInputStream(args[0]);
164 
165         try {
166             HtmlDocument document = new HtmlParser(r).HtmlDocument();
167             document.accept(new HtmlScrubber());
168             document.accept(new HtmlCollector());
169             document.accept(new HtmlDumper(System.out));
170         } finally {
171             r.close();
172         }
173     }
174 }
175