View Javadoc

1   /*
2    * HtmlScrubber.java -- cleans up HTML document tree.  
3    * Copyright (C) 1999 Quiotix Corporation.  
4    *
5    * This program is free software; you can redistribute it and/or modify
6    * it under the terms of the GNU General Public License, version 2, as 
7    * published by the Free Software Foundation.  
8    *
9    * This program is distributed in the hope that it will be useful,
10   * but WITHOUT ANY WARRANTY; without even the implied warranty of
11   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12   * GNU General Public License (http://www.gnu.org/copyleft/gpl.txt)
13   * for more details.
14   */
15  
16  package com.quiotix.html.parser;
17  import java.util.Iterator;
18  
19  /**
20   * HtmlScrubber is a Visitor which walks an HtmlDocument and cleans it up.
21   * It can change tags and tag attributes to uppercase or lowercase, strip
22   * out unnecessary quotes from attribute values, and strip trailing spaces
23   * before a newline.
24   *
25   * @author Brian Goetz, Quiotix
26   * Additional contributions by: Thorsten Weber
27   */
28  
29  public class HtmlScrubber extends HtmlVisitor {
30  
31      /** Set tag case to upper. */
32      public static final int TAGS_UPCASE     = 1;
33      /** Set tag case to lower. */
34      public static final int TAGS_DOWNCASE   = 2;
35      /** Set attribute case to upper. */
36      public static final int ATTR_UPCASE     = 4;
37      /** Set attribute case to lower. */
38      public static final int ATTR_DOWNCASE   = 8;
39      /** Remove quotes. */
40      public static final int STRIP_QUOTES    = 16;
41      /** Trim spaces. */
42      public static final int TRIM_SPACES     = 32;
43      /** Quote attributes. */
44      public static final int QUOTE_ATTRS     = 64;
45      /** Defaults: downcase tags and attributes, quote attributes. */
46      public static final int DEFAULT_OPTIONS =
47              TAGS_DOWNCASE | ATTR_DOWNCASE | QUOTE_ATTRS;
48  
49      protected int flags;
50      protected HtmlDocument.HtmlElement previousElement;
51      protected boolean inPreBlock;
52  
53      /** 
54       * Create an HtmlScrubber with the default options 
55       * (downcase tags and tag attributes, strip out unnecessary quotes).
56       */
57      public HtmlScrubber() {
58          this(DEFAULT_OPTIONS);
59      };
60  
61      /** 
62       * Create an HtmlScrubber with the desired set of options.
63       * @param flags A bitmask representing the desired scrubbing options
64       */
65      public HtmlScrubber(int flags) {
66          this.flags = flags;
67      };
68  
69      private static boolean safeToUnquote(String qs) {
70          int upperCount=0, lowerCount=0, idCount=0;
71  
72          for (int i=1; i < qs.length()-1; i++) {
73              char c = qs.charAt(i);
74              if (Character.isUnicodeIdentifierPart(c))
75                  ++idCount;
76              if (Character.isUpperCase(c))
77                  ++upperCount;
78              else if (Character.isLowerCase(c))
79                  ++lowerCount;
80          }
81          return (qs.length()-2 > 0
82                  && (qs.length()-2 == idCount
83                  && (upperCount == 0 || lowerCount == 0)));
84      }
85  
86      private static boolean isSingleQuoted(String s) {
87        if (s.charAt(0) =='\'' && s.charAt(s.length()-1) == '\'') {
88          return true;
89        }
90        else return false;
91          
92      }
93      private static boolean isDoubleQuoted(String s) {
94        if (s.charAt(0) =='"' && s.charAt(s.length()-1) == '"') {
95          return true;
96        }
97        else return false;
98          
99      }
100     private static boolean isQuoted(String s) {
101       return isDoubleQuoted(s) || isSingleQuoted(s);
102     }
103 
104     public void start() {
105         previousElement = null;
106         inPreBlock = false;
107     }
108 
109     public void visit(HtmlDocument.Tag t) {
110         if ((flags & TAGS_UPCASE) != 0)
111             t.tagName = t.tagName.toUpperCase();
112         else if ((flags & TAGS_DOWNCASE) != 0)
113             t.tagName = t.tagName.toLowerCase();
114         for (Iterator it=t.attributeList.attributes.iterator(); it.hasNext(); ) {
115             HtmlDocument.Attribute a = (HtmlDocument.Attribute) it.next();
116             if ((flags & ATTR_UPCASE) != 0)
117                 a.name = a.name.toUpperCase();
118             else if ((flags & ATTR_DOWNCASE) != 0)
119                 a.name = a.name.toLowerCase();
120             if (((flags & STRIP_QUOTES) != 0)
121                 && a.hasValue
122                 && isQuoted(a.value)
123                 && safeToUnquote(a.value)) {
124               a.value = a.value.substring(1, a.value.length()-1);
125             }
126             if (((flags & QUOTE_ATTRS) != 0)
127                 && a.hasValue) {
128               if (!isDoubleQuoted(a.value)) {
129                 if (isSingleQuoted(a.value)) {
130                   a.value = a.value.substring(1, a.value.length()-1);
131                 } 
132                 a.value = "\"" + a.value + "\"";
133               }
134               //System.err.println(a.value);
135             }
136         }
137 
138         previousElement = t;
139     }
140 
141     public void visit(HtmlDocument.EndTag t) {
142         if ((flags & TAGS_UPCASE) != 0)
143             t.tagName = t.tagName.toUpperCase();
144         else if ((flags & TAGS_DOWNCASE) != 0)
145             t.tagName = t.tagName.toLowerCase();
146 
147         previousElement = t;
148     }
149 
150     public void visit(HtmlDocument.Text t) {
151         if (((flags & TRIM_SPACES) != 0)
152                 && !inPreBlock
153                 && (previousElement instanceof HtmlDocument.Newline
154                 || previousElement instanceof HtmlDocument.Tag
155                 || previousElement instanceof HtmlDocument.EndTag
156                 || previousElement instanceof HtmlDocument.Comment)) {
157             int i;
158             for (i=0; i<t.text.length(); i++)
159                 if (t.text.charAt(i) != ' '
160                         && t.text.charAt(i) != '\t')
161                     break;
162             if (i > 0)
163                 t.text = t.text.substring(i);
164         }
165         previousElement = t;
166     }
167 
168     public void visit(HtmlDocument.Comment c)     { previousElement = c; }
169     public void visit(HtmlDocument.Newline n)     { previousElement = n; }
170     public void visit(HtmlDocument.Annotation a)  { previousElement = a; }
171     public void visit(HtmlDocument.TagBlock bl) {
172         if (bl.startTag.tagName.equalsIgnoreCase("PRE")
173                 || bl.startTag.tagName.equalsIgnoreCase("SCRIPT")
174                 || bl.startTag.tagName.equalsIgnoreCase("STYLE")) {
175             inPreBlock = true;
176             super.visit(bl);
177             inPreBlock = false;
178         }
179         else
180             super.visit(bl);
181     }
182 }
183 
184