Coverage Report - com.quiotix.html.parser.HtmlDocument
 
Classes in this File Line Coverage Branch Coverage Complexity
HtmlDocument
100%
11/11
80%
8/10
1.596
HtmlDocument$Annotation
0%
0/8
N/A
1.596
HtmlDocument$Attribute
100%
18/18
87%
7/8
1.596
HtmlDocument$AttributeList
18%
4/22
0%
0/16
1.596
HtmlDocument$Comment
0%
0/7
N/A
1.596
HtmlDocument$ElementSequence
53%
7/13
N/A
1.596
HtmlDocument$EndTag
0%
0/7
N/A
1.596
HtmlDocument$HtmlElement
100%
1/1
N/A
1.596
HtmlDocument$Newline
66%
4/6
N/A
1.596
HtmlDocument$Tag
62%
18/29
37%
3/8
1.596
HtmlDocument$TagBlock
0%
0/30
0%
0/10
1.596
HtmlDocument$Text
71%
5/7
N/A
1.596
 
 1  
 /*
 2  
  * HtmlDocument.java -- classes to represent HTML documents as parse trees.
 3  
  * Copyright (C) 1999 Quiotix Corporation.  
 4  
  *
 5  
  * This program is free software; you can redistribute it and/or modify
 6  
  * it under the terms of the GNU General Public License, version 2, as 
 7  
  * published by the Free Software Foundation.  
 8  
  *
 9  
  * This program is distributed in the hope that it will be useful,
 10  
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
 11  
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 12  
  * GNU General Public License (http://www.gnu.org/copyleft/gpl.txt)
 13  
  * for more details.
 14  
  */
 15  
 
 16  
 package com.quiotix.html.parser;
 17  
 
 18  
 import java.util.ArrayList;
 19  
 import java.util.Iterator;
 20  
 import java.util.List;
 21  
 
 22  
 /**
 23  
  * Represents an HTML document as a sequence of elements.  The defined
 24  
  * element types are: Tag, EndTag, TagBlock (matched tag..end tag, with the
 25  
  * intervening elements), Comment, Text, Newline, and Annotation.
 26  
  * <p>
 27  
  * The various element types are defined as nested classes within
 28  
  * HtmlDocument.
 29  
  * </p>
 30  
  * @author Brian Goetz, Quiotix
 31  
  * @see com.quiotix.html.parser.HtmlVisitor
 32  
  */
 33  
 
 34  16
 public class HtmlDocument implements Visitable {
 35  
     ElementSequence elements;
 36  
 
 37  
     /** Constructor. */
 38  6
     public HtmlDocument(ElementSequence s) {
 39  6
         elements = s;
 40  6
     }
 41  
 
 42  
     public void accept(HtmlVisitor v) {
 43  14
         v.visit(this);
 44  14
     }
 45  
 
 46  
     private static String dequote(String s) {
 47  16
         if (s == null)
 48  6
             return "";
 49  10
         if ((s.startsWith("\"") && s.endsWith("\"")) || 
 50  
             (s.startsWith("'") && s.endsWith("'")))
 51  8
             return s.substring(1, s.length()-1);
 52  
         else
 53  2
             return s;
 54  
     }
 55  
 
 56  
     // The various elements of the HtmlDocument (Tag, EndTag, etc) are included
 57  
     // as nested subclasses largely for reasons of namespace control.
 58  
     // The following subclasses of HtmlElement exist: Tag, EndTag, Text, Comment,
 59  
     // Newline, Annotation, TagBlock.  Also, the additional classes
 60  
     // ElementSequence, Attribute, and AttributeList are defined here as well.
 61  
 
 62  
     // Each subclass of HtmlElement should have a visit() method in the
 63  
     // HtmlVisitor class.
 64  
 
 65  
     /**
 66  
      * Abstract class for HTML elements.  Enforces support for Visitors.
 67  
      */
 68  58
     public static abstract class HtmlElement implements Visitable, Sized {
 69  
         public abstract void accept(HtmlVisitor v);
 70  
     }
 71  
 
 72  
     /**
 73  
      * HTML start tag.  Stores the tag name and a list of tag attributes.
 74  
      */
 75  
     public static class Tag extends HtmlElement {
 76  
         /** The name of the tag. */
 77  
         public String tagName;
 78  
         /** A List of the tag's Attributes. */
 79  
         public AttributeList attributeList;
 80  
 
 81  
         /** 
 82  
          * Whether the tag has an empty content model  
 83  
          * eg the BR and HR tags.
 84  
          */
 85  36
         public boolean emptyTag = false;
 86  
 
 87  
         /** Constructor. */
 88  36
         public Tag(String t, AttributeList a) {
 89  36
             tagName = t;
 90  36
             attributeList = a;
 91  36
         }
 92  
 
 93  
         /** Set Tag type to Empty. */
 94  
         public void setEmpty(boolean b) {
 95  0
             emptyTag = b;
 96  0
         }
 97  
 
 98  
         public void accept(HtmlVisitor v) {
 99  76
             v.visit(this);
 100  76
         }
 101  
 
 102  
         /** Whether Tag has an Attribute with given name. */
 103  
         public boolean hasAttribute(String name) {
 104  0
             return attributeList.contains(name);
 105  
         }
 106  
 
 107  
         /** 
 108  
          * Whether Tag has an Attribute with given name 
 109  
          * and that Attribute has a non-null value. 
 110  
          */
 111  
         public boolean hasAttributeValue(String name) {
 112  0
             return attributeList.hasValue(name);
 113  
         }
 114  
 
 115  
         /**
 116  
          * @return the value of the Attribute with the given name or null
 117  
          */
 118  
         public String getAttributeValue(String name) {
 119  0
             return attributeList.getValue(name);
 120  
         }
 121  
 
 122  
         public int getLength() {
 123  0
             int length = 0;
 124  0
             for (Iterator iterator = attributeList.attributes.iterator(); iterator.hasNext();) {
 125  0
                 Attribute attribute = (Attribute) iterator.next();
 126  0
                 length += 1 + (attribute.getLength());
 127  0
             }
 128  0
             return length + tagName.length() + 2 + (emptyTag ? 1 : 0);
 129  
         }
 130  
 
 131  
         public String toString() {
 132  20
             StringBuffer s = new StringBuffer();
 133  20
             s.append("<");
 134  20
             s.append(tagName);
 135  20
             for (Iterator iterator = attributeList.attributes.iterator(); iterator.hasNext();) {
 136  4
                 Attribute attribute = (Attribute) iterator.next();
 137  4
                 s.append(" ");
 138  4
                 s.append(attribute.toString());
 139  4
             }
 140  20
             if (emptyTag) s.append("/");
 141  20
             s.append(">");
 142  20
             return s.toString();
 143  
         }
 144  
     }
 145  
 
 146  
     /**
 147  
      * HTML end tag.  Stores only the tag name.
 148  
      */
 149  
     public static class EndTag extends HtmlElement {
 150  
 
 151  
         /** The name of the Tag. */
 152  
         public String tagName;
 153  
 
 154  
         /** Constructor. */
 155  0
         public EndTag(String t) {
 156  0
             tagName = t;
 157  0
         }
 158  
 
 159  
         public void accept(HtmlVisitor v) {
 160  0
             v.visit(this);
 161  0
         }
 162  
 
 163  
         public int getLength() {
 164  0
             return 3 + tagName.length();
 165  
         }
 166  
 
 167  
         public String toString() {
 168  0
             return "</" + tagName + ">";
 169  
         }
 170  
     }
 171  
 
 172  
     /**
 173  
      * A tag block is a composite structure consisting of a start tag,
 174  
      * a sequence of HTML elements, and a matching end tag.
 175  
      */
 176  
     public static class TagBlock extends HtmlElement {
 177  
         /** Tag at start of Block.*/
 178  
         public Tag startTag;
 179  
         /** Tag at end of Block.*/
 180  
         public EndTag endTag;
 181  
         /** The sequence of elements which make up the body.*/
 182  
         public ElementSequence body;
 183  
 
 184  
         /** Constructor. */
 185  0
         public TagBlock(String name, AttributeList aList, ElementSequence b) {
 186  0
             startTag = new Tag(name, aList);
 187  0
             endTag = new EndTag(name);
 188  0
             body = b;
 189  0
         }
 190  
 
 191  
         public void accept(HtmlVisitor v) {
 192  0
             v.visit(this);
 193  0
         }
 194  
         
 195  
         public int getLength() { 
 196  0
             int bodyLength = 0;
 197  0
             for (Iterator iterator = body.iterator(); iterator.hasNext();) {
 198  0
                 HtmlDocument.HtmlElement htmlElement = (HtmlDocument.HtmlElement) iterator.next();
 199  0
                 bodyLength += htmlElement.getLength();    
 200  0
             }
 201  0
             return startTag.getLength() + bodyLength + endTag.getLength();
 202  
         }
 203  
         
 204  
         public String toString() {
 205  0
           StringBuffer sb = new StringBuffer();
 206  0
           sb.append(startTag.toString());
 207  0
           for (Iterator iterator = body.iterator(); iterator.hasNext();) {
 208  0
             HtmlDocument.HtmlElement htmlElement = (HtmlDocument.HtmlElement) iterator.next();
 209  0
             sb.append(htmlElement.toString());
 210  0
           }
 211  0
           sb.append(endTag.toString());
 212  0
           return sb.toString();
 213  
         }
 214  
         
 215  
         /**
 216  
          * @return the text within a tag block
 217  
          */
 218  
         public String text() {
 219  0
           StringBuffer sb = new StringBuffer();
 220  0
           for (Iterator iterator = body.iterator(); iterator.hasNext();) {
 221  0
             HtmlDocument.HtmlElement htmlElement = (HtmlDocument.HtmlElement) iterator.next();
 222  0
             if (htmlElement instanceof Text) {
 223  0
               sb.append(htmlElement.toString());
 224  0
             } else if(htmlElement instanceof TagBlock)
 225  0
               sb.append(((TagBlock)htmlElement).text());
 226  0
           }
 227  0
           return sb.toString();
 228  
         }
 229  
     }
 230  
 
 231  
     /**
 232  
      * HTML comments.
 233  
      */
 234  
     public static class Comment extends HtmlElement {
 235  
         /**
 236  
          * Note that a Comment starts and ends with two hyphen characters. 
 237  
          */
 238  
         public String comment;
 239  
 
 240  
         /** Constructor. */
 241  0
         public Comment(String c) {
 242  0
             comment = c;
 243  0
         }
 244  
 
 245  
         public void accept(HtmlVisitor v) {
 246  0
             v.visit(this);
 247  0
         }
 248  
 
 249  
         public int getLength() {
 250  0
             return 3 + comment.length();
 251  
         }
 252  
 
 253  
         public String toString() {
 254  0
             return "<!" + comment + ">";
 255  
         }
 256  
     }
 257  
 
 258  
     /**
 259  
      * Plain text
 260  
      */
 261  
     public static class Text extends HtmlElement {
 262  
         /** The text. */
 263  
         public String text;
 264  
 
 265  
         /** Constructor. */
 266  14
         public Text(String t) {
 267  14
             text = t;
 268  14
         }
 269  
 
 270  
         public void accept(HtmlVisitor v) {
 271  30
             v.visit(this);
 272  30
         }
 273  
 
 274  
         public int getLength() {
 275  0
             return text.length();
 276  
         }
 277  
 
 278  
         public String toString() {
 279  0
             return text;
 280  
         }
 281  
     }
 282  
 
 283  
     /**
 284  
      * End of line indicator.
 285  
      */
 286  8
     public static class Newline extends HtmlElement {
 287  
         /** The system specific newline String. */
 288  2
         public static final String NL = System.getProperty("line.separator");
 289  
 
 290  
         public void accept(HtmlVisitor v) {
 291  16
             v.visit(this);
 292  16
         }
 293  
 
 294  
         public int getLength() {
 295  0
             return NL.length();
 296  
         }
 297  
 
 298  
         public String toString() {
 299  0
             return NL;
 300  
         }
 301  
     }
 302  
 
 303  
     /**
 304  
      * A sequence of HTML elements.
 305  
      */
 306  
     public static class ElementSequence {
 307  
         private List elements;
 308  
 
 309  
         /** Constructor. */
 310  0
         public ElementSequence(int n) {
 311  0
             elements = new ArrayList(n);
 312  0
         }
 313  
 
 314  
         /** Constructor. */
 315  6
         public ElementSequence() {
 316  6
             elements = new ArrayList();
 317  6
         }
 318  
 
 319  
         /** Add element to list. */
 320  
         public void addElement(HtmlElement o) {
 321  58
             elements.add(o);
 322  58
         }
 323  
 
 324  
         /**
 325  
          * @return the number of elements in this list.
 326  
          */
 327  
         public int size() {
 328  4
             return elements.size();
 329  
         }
 330  
 
 331  
         /**
 332  
          * @return an iterator over the elements in this list in proper sequence.
 333  
          */
 334  
         public Iterator iterator() {
 335  14
             return elements.iterator();
 336  
         }
 337  
 
 338  
         /**
 339  
          * Clear current elements and replace with given Collection.
 340  
          * 
 341  
          * @param collection to replace elements with
 342  
          */
 343  
         public void setElements(List collection) {
 344  0
             elements.clear();
 345  0
             elements.addAll(collection);
 346  0
         }
 347  
     }
 348  
 
 349  
     /**
 350  
      * Annotations.  These are not part of the HTML document, but
 351  
      * provide a way for HTML-processing applications to insert
 352  
      * annotations into the document.  These annotations can be used by
 353  
      * other programs or can be brought to the user's attention at a
 354  
      * later time.  For example, the HtmlCollector might insert an
 355  
      * annotation to indicate that there is no corresponding start tag
 356  
      * for an end tag.
 357  
      */
 358  
     public static class Annotation extends HtmlElement {
 359  
         String type, text;
 360  
 
 361  
         /** Constructor. */
 362  0
         public Annotation(String type, String text) {
 363  0
             this.type = type;
 364  0
             this.text = text;
 365  0
         }
 366  
 
 367  
         public void accept(HtmlVisitor v) {
 368  0
             v.visit(this);
 369  0
         }
 370  
 
 371  
         public int getLength() {
 372  0
             return 14 + type.length() + text.length();
 373  
         }
 374  
 
 375  
         public String toString() {
 376  0
             return "<!--NOTE(" + type + ") " + text + "-->";
 377  
         }
 378  
     }
 379  
 
 380  
     /**
 381  
      * A Tag Attribute.
 382  
      */
 383  
     public static class Attribute implements Sized {
 384  
         /** The name of this Attribute. */
 385  
         public String name;
 386  
         /** The value of this Attribute, including any surrounding quotes. */
 387  
         public String value;
 388  
         /** Whether the Attribute has a value. */
 389  
         public boolean hasValue;
 390  
 
 391  
         /** Constructor. */
 392  2
         public Attribute(String n) {
 393  2
             name = n;
 394  2
             hasValue = false;
 395  2
         }
 396  
 
 397  
         /** Constructor. */
 398  10
         public Attribute(String n, String v) {
 399  10
             name = n;
 400  10
             if (v != null) {
 401  10
                 value = v;
 402  10
                 hasValue = true;
 403  
             }
 404  10
         }
 405  
 
 406  
         /** 
 407  
          * Whether quotes are included is dependent upon the source document.
 408  
          * 
 409  
          * {@inheritDoc}
 410  
          * @see com.quiotix.html.parser.Sized#getLength()
 411  
          */
 412  
         public int getLength() {
 413  12
             return (hasValue ? name.length() + 1 + value.length() : name.length());
 414  
         }
 415  
 
 416  
         public String toString() {
 417  20
             return (hasValue ? name + "=" + value : name);
 418  
         }
 419  
         
 420  
         /**
 421  
          * @return the value with quotes removed
 422  
          */
 423  
         public String getValue() { 
 424  16
             return dequote(value);
 425  
         }
 426  
         
 427  
         /**
 428  
          * @param v the value to set, may be null
 429  
          */
 430  
         public void setValue(String v) {
 431  10
             value = v;
 432  10
             if (v == null)  
 433  4
                 hasValue = false;
 434  
             else 
 435  6
                 hasValue = true;
 436  10
         }
 437  
     }
 438  
 
 439  
     /**
 440  
      * A List of Attributes.
 441  
      */
 442  36
     public static class AttributeList {
 443  
         /** The backing List. */
 444  36
         public List attributes = new ArrayList();
 445  
 
 446  
         /** Add. */
 447  
         public void addAttribute(Attribute a) {
 448  10
             attributes.add(a);
 449  10
         }
 450  
 
 451  
         /** Whether the List contains an Attribute with the given name. */
 452  
         public boolean contains(String name) {
 453  0
             for (Iterator iterator = attributes.iterator(); iterator.hasNext();) {
 454  0
                 Attribute attribute = (Attribute) iterator.next();
 455  0
                 if (attribute.name.equalsIgnoreCase(name))
 456  0
                     return true;
 457  0
             }
 458  0
             return false;
 459  
         }
 460  
 
 461  
         /** 
 462  
          * Whether the List contains an Attribute with the given name 
 463  
          * and that Attribute has a non-null value. 
 464  
          */
 465  
         public boolean hasValue(String name) {
 466  0
             for (Iterator iterator = attributes.iterator(); iterator.hasNext();) {
 467  0
                 Attribute attribute = (Attribute) iterator.next();
 468  0
                 if (attribute.name.equalsIgnoreCase(name) && attribute.hasValue)
 469  0
                     return true;
 470  0
             }
 471  0
             return false;
 472  
         }
 473  
 
 474  
         /**
 475  
          * @param name the name of the Attribute
 476  
          * @return the value of the Attribute with the given name or null
 477  
          */
 478  
         public String getValue(String name) {
 479  0
             for (Iterator iterator = attributes.iterator(); iterator.hasNext();) {
 480  0
                 Attribute attribute = (Attribute) iterator.next();
 481  0
                 if (attribute.name.equalsIgnoreCase(name) && attribute.hasValue)
 482  0
                     return dequote(attribute.value);
 483  0
             }
 484  0
             return null;
 485  
         }
 486  
     }
 487  
 }
 488  
 
 489  
 
 490