| Classes in this File | Line Coverage | Branch Coverage | Complexity | ||||
| HtmlScrubber |
|
| 4.0;4 |
| 1 | /* | |
| 2 | * HtmlScrubber.java -- cleans up HTML document tree. | |
| 3 | * Copyright (C) 1999 Quiotix Corporation. | |
| 4 | * | |
| 5 | * This program is free software; you can redistribute it and/or modify | |
| 6 | * it under the terms of the GNU General Public License, version 2, as | |
| 7 | * published by the Free Software Foundation. | |
| 8 | * | |
| 9 | * This program is distributed in the hope that it will be useful, | |
| 10 | * but WITHOUT ANY WARRANTY; without even the implied warranty of | |
| 11 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
| 12 | * GNU General Public License (http://www.gnu.org/copyleft/gpl.txt) | |
| 13 | * for more details. | |
| 14 | */ | |
| 15 | ||
| 16 | package com.quiotix.html.parser; | |
| 17 | import java.util.Iterator; | |
| 18 | ||
| 19 | /** | |
| 20 | * HtmlScrubber is a Visitor which walks an HtmlDocument and cleans it up. | |
| 21 | * It can change tags and tag attributes to uppercase or lowercase, strip | |
| 22 | * out unnecessary quotes from attribute values, and strip trailing spaces | |
| 23 | * before a newline. | |
| 24 | * | |
| 25 | * @author Brian Goetz, Quiotix | |
| 26 | * Additional contributions by: Thorsten Weber | |
| 27 | */ | |
| 28 | ||
| 29 | public class HtmlScrubber extends HtmlVisitor { | |
| 30 | ||
| 31 | /** Set tag case to upper. */ | |
| 32 | public static final int TAGS_UPCASE = 1; | |
| 33 | /** Set tag case to lower. */ | |
| 34 | public static final int TAGS_DOWNCASE = 2; | |
| 35 | /** Set attribute case to upper. */ | |
| 36 | public static final int ATTR_UPCASE = 4; | |
| 37 | /** Set attribute case to lower. */ | |
| 38 | public static final int ATTR_DOWNCASE = 8; | |
| 39 | /** Remove quotes. */ | |
| 40 | public static final int STRIP_QUOTES = 16; | |
| 41 | /** Trim spaces. */ | |
| 42 | public static final int TRIM_SPACES = 32; | |
| 43 | /** Quote attributes. */ | |
| 44 | public static final int QUOTE_ATTRS = 64; | |
| 45 | /** Defaults: downcase tags and attributes, quote attributes. */ | |
| 46 | public static final int DEFAULT_OPTIONS = | |
| 47 | TAGS_DOWNCASE | ATTR_DOWNCASE | QUOTE_ATTRS; | |
| 48 | ||
| 49 | protected int flags; | |
| 50 | protected HtmlDocument.HtmlElement previousElement; | |
| 51 | protected boolean inPreBlock; | |
| 52 | ||
| 53 | /** | |
| 54 | * Create an HtmlScrubber with the default options | |
| 55 | * (downcase tags and tag attributes, strip out unnecessary quotes). | |
| 56 | */ | |
| 57 | public HtmlScrubber() { | |
| 58 | 0 | this(DEFAULT_OPTIONS); |
| 59 | 0 | }; |
| 60 | ||
| 61 | /** | |
| 62 | * Create an HtmlScrubber with the desired set of options. | |
| 63 | * @param flags A bitmask representing the desired scrubbing options | |
| 64 | */ | |
| 65 | 4 | public HtmlScrubber(int flags) { |
| 66 | 4 | this.flags = flags; |
| 67 | 4 | }; |
| 68 | ||
| 69 | private static boolean safeToUnquote(String qs) { | |
| 70 | 0 | int upperCount=0, lowerCount=0, idCount=0; |
| 71 | ||
| 72 | 0 | for (int i=1; i < qs.length()-1; i++) { |
| 73 | 0 | char c = qs.charAt(i); |
| 74 | 0 | if (Character.isUnicodeIdentifierPart(c)) |
| 75 | 0 | ++idCount; |
| 76 | 0 | if (Character.isUpperCase(c)) |
| 77 | 0 | ++upperCount; |
| 78 | 0 | else if (Character.isLowerCase(c)) |
| 79 | 0 | ++lowerCount; |
| 80 | } | |
| 81 | 0 | return (qs.length()-2 > 0 |
| 82 | && (qs.length()-2 == idCount | |
| 83 | && (upperCount == 0 || lowerCount == 0))); | |
| 84 | } | |
| 85 | ||
| 86 | private static boolean isSingleQuoted(String s) { | |
| 87 | 4 | if (s.charAt(0) =='\'' && s.charAt(s.length()-1) == '\'') { |
| 88 | 0 | return true; |
| 89 | } | |
| 90 | 4 | else return false; |
| 91 | ||
| 92 | } | |
| 93 | private static boolean isDoubleQuoted(String s) { | |
| 94 | 4 | if (s.charAt(0) =='"' && s.charAt(s.length()-1) == '"') { |
| 95 | 0 | return true; |
| 96 | } | |
| 97 | 4 | else return false; |
| 98 | ||
| 99 | } | |
| 100 | private static boolean isQuoted(String s) { | |
| 101 | 0 | return isDoubleQuoted(s) || isSingleQuoted(s); |
| 102 | } | |
| 103 | ||
| 104 | public void start() { | |
| 105 | 4 | previousElement = null; |
| 106 | 4 | inPreBlock = false; |
| 107 | 4 | } |
| 108 | ||
| 109 | public void visit(HtmlDocument.Tag t) { | |
| 110 | 20 | if ((flags & TAGS_UPCASE) != 0) |
| 111 | 0 | t.tagName = t.tagName.toUpperCase(); |
| 112 | 20 | else if ((flags & TAGS_DOWNCASE) != 0) |
| 113 | 20 | t.tagName = t.tagName.toLowerCase(); |
| 114 | 20 | for (Iterator it=t.attributeList.attributes.iterator(); it.hasNext(); ) { |
| 115 | 4 | HtmlDocument.Attribute a = (HtmlDocument.Attribute) it.next(); |
| 116 | 4 | if ((flags & ATTR_UPCASE) != 0) |
| 117 | 0 | a.name = a.name.toUpperCase(); |
| 118 | 4 | else if ((flags & ATTR_DOWNCASE) != 0) |
| 119 | 4 | a.name = a.name.toLowerCase(); |
| 120 | 4 | if (((flags & STRIP_QUOTES) != 0) |
| 121 | && a.hasValue | |
| 122 | && isQuoted(a.value) | |
| 123 | && safeToUnquote(a.value)) { | |
| 124 | 0 | a.value = a.value.substring(1, a.value.length()-1); |
| 125 | } | |
| 126 | 4 | if (((flags & QUOTE_ATTRS) != 0) |
| 127 | && a.hasValue) { | |
| 128 | 4 | if (!isDoubleQuoted(a.value)) { |
| 129 | 4 | if (isSingleQuoted(a.value)) { |
| 130 | 0 | a.value = a.value.substring(1, a.value.length()-1); |
| 131 | } | |
| 132 | 4 | a.value = "\"" + a.value + "\""; |
| 133 | } | |
| 134 | //System.err.println(a.value); | |
| 135 | } | |
| 136 | 4 | } |
| 137 | ||
| 138 | 20 | previousElement = t; |
| 139 | 20 | } |
| 140 | ||
| 141 | public void visit(HtmlDocument.EndTag t) { | |
| 142 | 0 | if ((flags & TAGS_UPCASE) != 0) |
| 143 | 0 | t.tagName = t.tagName.toUpperCase(); |
| 144 | 0 | else if ((flags & TAGS_DOWNCASE) != 0) |
| 145 | 0 | t.tagName = t.tagName.toLowerCase(); |
| 146 | ||
| 147 | 0 | previousElement = t; |
| 148 | 0 | } |
| 149 | ||
| 150 | public void visit(HtmlDocument.Text t) { | |
| 151 | 8 | if (((flags & TRIM_SPACES) != 0) |
| 152 | && !inPreBlock | |
| 153 | && (previousElement instanceof HtmlDocument.Newline | |
| 154 | || previousElement instanceof HtmlDocument.Tag | |
| 155 | || previousElement instanceof HtmlDocument.EndTag | |
| 156 | || previousElement instanceof HtmlDocument.Comment)) { | |
| 157 | int i; | |
| 158 | 6 | for (i=0; i<t.text.length(); i++) |
| 159 | 6 | if (t.text.charAt(i) != ' ' |
| 160 | && t.text.charAt(i) != '\t') | |
| 161 | 4 | break; |
| 162 | 4 | if (i > 0) |
| 163 | 2 | t.text = t.text.substring(i); |
| 164 | } | |
| 165 | 8 | previousElement = t; |
| 166 | 8 | } |
| 167 | ||
| 168 | 0 | public void visit(HtmlDocument.Comment c) { previousElement = c; } |
| 169 | 4 | public void visit(HtmlDocument.Newline n) { previousElement = n; } |
| 170 | 0 | public void visit(HtmlDocument.Annotation a) { previousElement = a; } |
| 171 | public void visit(HtmlDocument.TagBlock bl) { | |
| 172 | 0 | if (bl.startTag.tagName.equalsIgnoreCase("PRE") |
| 173 | || bl.startTag.tagName.equalsIgnoreCase("SCRIPT") | |
| 174 | || bl.startTag.tagName.equalsIgnoreCase("STYLE")) { | |
| 175 | 0 | inPreBlock = true; |
| 176 | 0 | super.visit(bl); |
| 177 | 0 | inPreBlock = false; |
| 178 | } | |
| 179 | else | |
| 180 | 0 | super.visit(bl); |
| 181 | 0 | } |
| 182 | } | |
| 183 | ||
| 184 |