Coverage Report - com.quiotix.html.parser.HtmlScrubber
 
Classes in this File Line Coverage Branch Coverage Complexity
HtmlScrubber
51%
35/68
28%
24/84
4
 
 1  
 /*
 2  
  * HtmlScrubber.java -- cleans up HTML document tree.  
 3  
  * Copyright (C) 1999 Quiotix Corporation.  
 4  
  *
 5  
  * This program is free software; you can redistribute it and/or modify
 6  
  * it under the terms of the GNU General Public License, version 2, as 
 7  
  * published by the Free Software Foundation.  
 8  
  *
 9  
  * This program is distributed in the hope that it will be useful,
 10  
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
 11  
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 12  
  * GNU General Public License (http://www.gnu.org/copyleft/gpl.txt)
 13  
  * for more details.
 14  
  */
 15  
 
 16  
 package com.quiotix.html.parser;
 17  
 import java.util.Iterator;
 import java.util.Locale;
 18  
 
 19  
 /**
 20  
  * HtmlScrubber is a Visitor which walks an HtmlDocument and cleans it up.
 21  
  * It can change tags and tag attributes to uppercase or lowercase, strip
 22  
  * out unnecessary quotes from attribute values, and strip trailing spaces
 23  
  * before a newline.
 24  
  *
 25  
  * @author Brian Goetz, Quiotix
 26  
  * Additional contributions by: Thorsten Weber
 27  
  */
 28  
 
 29  
 public class HtmlScrubber extends HtmlVisitor {
 30  
 
 31  
     /** Set tag case to upper. */
 32  
     public static final int TAGS_UPCASE     = 1;
 33  
     /** Set tag case to lower. */
 34  
     public static final int TAGS_DOWNCASE   = 2;
 35  
     /** Set attribute case to upper. */
 36  
     public static final int ATTR_UPCASE     = 4;
 37  
     /** Set attribute case to lower. */
 38  
     public static final int ATTR_DOWNCASE   = 8;
 39  
     /** Remove quotes. */
 40  
     public static final int STRIP_QUOTES    = 16;
 41  
     /** Trim spaces. */
 42  
     public static final int TRIM_SPACES     = 32;
 43  
     /** Quote attributes. */
 44  
     public static final int QUOTE_ATTRS     = 64;
 45  
     /** Defaults: downcase tags and attributes, quote attributes. */
 46  
     public static final int DEFAULT_OPTIONS =
 47  
             TAGS_DOWNCASE | ATTR_DOWNCASE | QUOTE_ATTRS;
 48  
 
 49  
     protected int flags;
 50  
     protected HtmlDocument.HtmlElement previousElement;
 51  
     protected boolean inPreBlock;
 52  
 
 53  
     /** 
 54  
      * Create an HtmlScrubber with the default options 
 55  
      * (downcase tags and tag attributes, strip out unnecessary quotes).
 56  
      */
 57  
     public HtmlScrubber() {
 58  0
         this(DEFAULT_OPTIONS);
 59  0
     };
 60  
 
 61  
     /** 
 62  
      * Create an HtmlScrubber with the desired set of options.
 63  
      * @param flags A bitmask representing the desired scrubbing options
 64  
      */
 65  4
     public HtmlScrubber(int flags) {
 66  4
         this.flags = flags;
 67  4
     };
 68  
 
 69  
     private static boolean safeToUnquote(String qs) {
 70  0
         int upperCount=0, lowerCount=0, idCount=0;
 71  
 
 72  0
         for (int i=1; i < qs.length()-1; i++) {
 73  0
             char c = qs.charAt(i);
 74  0
             if (Character.isUnicodeIdentifierPart(c))
 75  0
                 ++idCount;
 76  0
             if (Character.isUpperCase(c))
 77  0
                 ++upperCount;
 78  0
             else if (Character.isLowerCase(c))
 79  0
                 ++lowerCount;
 80  
         }
 81  0
         return (qs.length()-2 > 0
 82  
                 && (qs.length()-2 == idCount
 83  
                 && (upperCount == 0 || lowerCount == 0)));
 84  
     }
 85  
 
 86  
     private static boolean isSingleQuoted(String s) {
 87  4
       if (s.charAt(0) =='\'' && s.charAt(s.length()-1) == '\'') {
 88  0
         return true;
 89  
       }
 90  4
       else return false;
 91  
         
 92  
     }
 93  
     private static boolean isDoubleQuoted(String s) {
 94  4
       if (s.charAt(0) =='"' && s.charAt(s.length()-1) == '"') {
 95  0
         return true;
 96  
       }
 97  4
       else return false;
 98  
         
 99  
     }
 100  
     private static boolean isQuoted(String s) {
 101  0
       return isDoubleQuoted(s) || isSingleQuoted(s);
 102  
     }
 103  
 
 104  
     public void start() {
 105  4
         previousElement = null;
 106  4
         inPreBlock = false;
 107  4
     }
 108  
 
 109  
     public void visit(HtmlDocument.Tag t) {
 110  20
         if ((flags & TAGS_UPCASE) != 0)
 111  0
             t.tagName = t.tagName.toUpperCase();
 112  20
         else if ((flags & TAGS_DOWNCASE) != 0)
 113  20
             t.tagName = t.tagName.toLowerCase();
 114  20
         for (Iterator it=t.attributeList.attributes.iterator(); it.hasNext(); ) {
 115  4
             HtmlDocument.Attribute a = (HtmlDocument.Attribute) it.next();
 116  4
             if ((flags & ATTR_UPCASE) != 0)
 117  0
                 a.name = a.name.toUpperCase();
 118  4
             else if ((flags & ATTR_DOWNCASE) != 0)
 119  4
                 a.name = a.name.toLowerCase();
 120  4
             if (((flags & STRIP_QUOTES) != 0)
 121  
                 && a.hasValue
 122  
                 && isQuoted(a.value)
 123  
                 && safeToUnquote(a.value)) {
 124  0
               a.value = a.value.substring(1, a.value.length()-1);
 125  
             }
 126  4
             if (((flags & QUOTE_ATTRS) != 0)
 127  
                 && a.hasValue) {
 128  4
               if (!isDoubleQuoted(a.value)) {
 129  4
                 if (isSingleQuoted(a.value)) {
 130  0
                   a.value = a.value.substring(1, a.value.length()-1);
 131  
                 } 
 132  4
                 a.value = "\"" + a.value + "\"";
 133  
               }
 134  
               //System.err.println(a.value);
 135  
             }
 136  4
         }
 137  
 
 138  20
         previousElement = t;
 139  20
     }
 140  
 
 141  
     public void visit(HtmlDocument.EndTag t) {
 142  0
         if ((flags & TAGS_UPCASE) != 0)
 143  0
             t.tagName = t.tagName.toUpperCase();
 144  0
         else if ((flags & TAGS_DOWNCASE) != 0)
 145  0
             t.tagName = t.tagName.toLowerCase();
 146  
 
 147  0
         previousElement = t;
 148  0
     }
 149  
 
 150  
     public void visit(HtmlDocument.Text t) {
 151  8
         if (((flags & TRIM_SPACES) != 0)
 152  
                 && !inPreBlock
 153  
                 && (previousElement instanceof HtmlDocument.Newline
 154  
                 || previousElement instanceof HtmlDocument.Tag
 155  
                 || previousElement instanceof HtmlDocument.EndTag
 156  
                 || previousElement instanceof HtmlDocument.Comment)) {
 157  
             int i;
 158  6
             for (i=0; i<t.text.length(); i++)
 159  6
                 if (t.text.charAt(i) != ' '
 160  
                         && t.text.charAt(i) != '\t')
 161  4
                     break;
 162  4
             if (i > 0)
 163  2
                 t.text = t.text.substring(i);
 164  
         }
 165  8
         previousElement = t;
 166  8
     }
 167  
 
 168  0
     public void visit(HtmlDocument.Comment c)     { previousElement = c; }
 169  4
     public void visit(HtmlDocument.Newline n)     { previousElement = n; }
 170  0
     public void visit(HtmlDocument.Annotation a)  { previousElement = a; }
 171  
     public void visit(HtmlDocument.TagBlock bl) {
 172  0
         if (bl.startTag.tagName.equalsIgnoreCase("PRE")
 173  
                 || bl.startTag.tagName.equalsIgnoreCase("SCRIPT")
 174  
                 || bl.startTag.tagName.equalsIgnoreCase("STYLE")) {
 175  0
             inPreBlock = true;
 176  0
             super.visit(bl);
 177  0
             inPreBlock = false;
 178  
         }
 179  
         else
 180  0
             super.visit(bl);
 181  0
     }
 182  
 }
 183  
 
 184