Coverage Report - com.quiotix.html.parser.HtmlCollector
 
Classes in this File Line Coverage Branch Coverage Complexity
HtmlCollector
46%
29/62
31%
5/16
1.667
HtmlCollector$1
N/A
N/A
1.667
HtmlCollector$ElementStack
66%
4/6
N/A
1.667
HtmlCollector$TagStackEntry
100%
1/1
N/A
1.667
 
 1  
 /*
 2  
  * HtmlCollector.java -- structures an HTML document tree.  
 3  
  * Copyright (C) 1999 Quiotix Corporation.  
 4  
  *
 5  
  * This program is free software; you can redistribute it and/or modify
 6  
  * it under the terms of the GNU General Public License, version 2, as 
 7  
  * published by the Free Software Foundation.  
 8  
  *
 9  
  * This program is distributed in the hope that it will be useful,
 10  
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
 11  
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 12  
  * GNU General Public License (http://www.gnu.org/copyleft/gpl.txt)
 13  
  * for more details.
 14  
  */
 15  
 
 16  
 package com.quiotix.html.parser;
 17  
 
 18  
 import java.io.FileInputStream;
 19  
 import java.io.InputStream;
 20  
 import java.util.HashSet;
 21  
 import java.util.Iterator;
 22  
 import java.util.Set;
 23  
 import java.util.Vector;
 24  
 
 25  
 /**
 26  
  * An HtmlVisitor which modifies the structure of the document so that
 27  
  * begin tags are matched properly with end tags and placed in TagBlock
 28  
  * elements.  Typically, an HtmlDocument is created by the parser, which
 29  
  * simply returns a flat list of elements.  The HtmlCollector takes this
 30  
  * flat list and gives it the structure that is implied by the HTML content.
 31  
  *
 32  
  * @author Brian Goetz, Quiotix
 33  
  */
 34  
 
 35  4
 public class HtmlCollector extends HtmlVisitor {
 36  
 
 37  4
     protected ElementStack tagStack = new ElementStack();
 38  
     protected ElementStack elements;
 39  
     protected boolean collected;
 40  2
     protected static Set dontMatch = new HashSet();
 41  2
     protected static String[] dontMatchStrings
 42  
     = {"AREA", "BASE", "BASEFONT", "BR", "COL", "HR", "IMG", "INPUT",
 43  
        "ISINDEX", "LINK", "META", "PARAM"};
 44  
 
 45  
     static {
 46  26
         for (int i = 0; i < dontMatchStrings.length; i++)
 47  24
             dontMatch.add(dontMatchStrings[i]);
 48  2
     }
 49  
 
 50  40
     private static class TagStackEntry {
 51  
         String tagName;
 52  
         int index;
 53  
     }
 54  
 
 55  4
     private static class ElementStack extends Vector {
 56  
       private static final long serialVersionUID = 3718394150667677113L;
 57  
 
 58  
         ElementStack() {
 59  4
             super();
 60  4
         }
 61  
 
 62  
         ElementStack(int n) {
 63  4
             super(n);
 64  4
         }
 65  
 
 66  
         void popN(int n) {
 67  0
             elementCount -= n;
 68  0
         }
 69  
     }
 70  
 
 71  
     protected int pushNode(HtmlDocument.HtmlElement e) {
 72  32
         elements.addElement(e);
 73  32
         return elements.size() - 1;
 74  
     }
 75  
 
 76  
     public void visit(HtmlDocument.Comment c) {
 77  0
         pushNode(c);
 78  0
     }
 79  
 
 80  
     public void visit(HtmlDocument.Text t) {
 81  8
         pushNode(t);
 82  8
     }
 83  
 
 84  
     public void visit(HtmlDocument.Newline n) {
 85  4
         pushNode(n);
 86  4
     }
 87  
 
 88  
     public void visit(HtmlDocument.Tag t) {
 89  20
         TagStackEntry ts = new TagStackEntry();
 90  
         int index;
 91  
 
 92  
         // Push the tag onto the element stack, and push an entry on the tag
 93  
         // stack if it's a tag we care about matching
 94  20
         index = pushNode(t);
 95  20
         if (!t.emptyTag
 96  
                 && !dontMatch.contains(t.tagName.toUpperCase())) {
 97  20
             ts.tagName = t.tagName;
 98  20
             ts.index = index;
 99  20
             tagStack.addElement(ts);
 100  
         }
 101  20
     }
 102  
 
 103  
     public void visit(HtmlDocument.EndTag t) {
 104  
         int i;
 105  0
         for (i = tagStack.size() - 1; i >= 0; i--) {
 106  0
             TagStackEntry ts = (TagStackEntry) tagStack.elementAt(i);
 107  0
             if (t.tagName.equalsIgnoreCase(ts.tagName)) {
 108  
                 HtmlDocument.TagBlock block;
 109  
                 HtmlDocument.ElementSequence blockElements;
 110  
                 HtmlDocument.Tag tag;
 111  
 
 112  
                 // Create a new ElementSequence and copy the elements to it
 113  0
                 blockElements =
 114  
                         new HtmlDocument.ElementSequence(elements.size() - ts.index - 1);
 115  0
                 for (int j = ts.index + 1; j < elements.size(); j++)
 116  0
                     blockElements.addElement((HtmlDocument.HtmlElement)
 117  
                             elements.elementAt(j));
 118  0
                 tag = (HtmlDocument.Tag) elements.elementAt(ts.index);
 119  0
                 block = new HtmlDocument.TagBlock(tag.tagName,
 120  
                         tag.attributeList, blockElements);
 121  
 
 122  
                 // Pop the elements off the stack, push the new block
 123  0
                 elements.popN(elements.size() - ts.index);
 124  0
                 elements.addElement(block);
 125  
 
 126  
                 // Pop the matched tag and intervening unmatched tags
 127  0
                 tagStack.popN(tagStack.size() - i);
 128  
 
 129  0
                 collected = true;
 130  0
                 break;
 131  
             }
 132  
         }
 133  
 
 134  
         // If we didn't find a match, just push the end tag
 135  0
         if (i < 0)
 136  0
             pushNode(t);
 137  0
     }
 138  
 
 139  
     public void visit(HtmlDocument.TagBlock bl) {
 140  0
         HtmlCollector c = new HtmlCollector();
 141  
 
 142  0
         c.start();
 143  0
         c.visit(bl.body);
 144  0
         c.finish();
 145  0
         pushNode(bl);
 146  0
     }
 147  
 
 148  
     public void visit(HtmlDocument.ElementSequence s) {
 149  4
         elements = new ElementStack(s.size());
 150  4
         collected = false;
 151  
 
 152  4
         for (Iterator iterator = s.iterator(); iterator.hasNext();) {
 153  32
             HtmlDocument.HtmlElement htmlElement = (HtmlDocument.HtmlElement) iterator.next();
 154  32
             htmlElement.accept(this);
 155  32
         }
 156  4
         if (collected)
 157  0
             s.setElements(elements);
 158  4
     }
 159  
 
 160  
     /**
 161  
      * Runnable.
 162  
      */
 163  
     public static void main(String[] args) throws Exception {
 164  0
         InputStream r = new FileInputStream(args[0]);
 165  
 
 166  
         try {
 167  0
             HtmlDocument document = new HtmlParser(r).HtmlDocument();
 168  0
             document.accept(new HtmlScrubber());
 169  0
             document.accept(new HtmlCollector());
 170  0
             document.accept(new HtmlDumper(System.out));
 171  
         } finally {
 172  0
             r.close();
 173  0
         }
 174  0
     }
 175  
 }
 176