1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16 package com.quiotix.html.parser;
import java.util.Iterator;
import java.util.Locale;
18
19
20
21
22
23
24
25
26
27
28
29 public class HtmlScrubber extends HtmlVisitor {
30
31
32 public static final int TAGS_UPCASE = 1;
33
34 public static final int TAGS_DOWNCASE = 2;
35
36 public static final int ATTR_UPCASE = 4;
37
38 public static final int ATTR_DOWNCASE = 8;
39
40 public static final int STRIP_QUOTES = 16;
41
42 public static final int TRIM_SPACES = 32;
43
44 public static final int QUOTE_ATTRS = 64;
45
46 public static final int DEFAULT_OPTIONS =
47 TAGS_DOWNCASE | ATTR_DOWNCASE | QUOTE_ATTRS;
48
49 protected int flags;
50 protected HtmlDocument.HtmlElement previousElement;
51 protected boolean inPreBlock;
52
53
54
55
56
57 public HtmlScrubber() {
58 this(DEFAULT_OPTIONS);
59 };
60
61
62
63
64
65 public HtmlScrubber(int flags) {
66 this.flags = flags;
67 };
68
69 private static boolean safeToUnquote(String qs) {
70 int upperCount=0, lowerCount=0, idCount=0;
71
72 for (int i=1; i < qs.length()-1; i++) {
73 char c = qs.charAt(i);
74 if (Character.isUnicodeIdentifierPart(c))
75 ++idCount;
76 if (Character.isUpperCase(c))
77 ++upperCount;
78 else if (Character.isLowerCase(c))
79 ++lowerCount;
80 }
81 return (qs.length()-2 > 0
82 && (qs.length()-2 == idCount
83 && (upperCount == 0 || lowerCount == 0)));
84 }
85
86 private static boolean isSingleQuoted(String s) {
87 if (s.charAt(0) =='\'' && s.charAt(s.length()-1) == '\'') {
88 return true;
89 }
90 else return false;
91
92 }
93 private static boolean isDoubleQuoted(String s) {
94 if (s.charAt(0) =='"' && s.charAt(s.length()-1) == '"') {
95 return true;
96 }
97 else return false;
98
99 }
100 private static boolean isQuoted(String s) {
101 return isDoubleQuoted(s) || isSingleQuoted(s);
102 }
103
104 public void start() {
105 previousElement = null;
106 inPreBlock = false;
107 }
108
109 public void visit(HtmlDocument.Tag t) {
110 if ((flags & TAGS_UPCASE) != 0)
111 t.tagName = t.tagName.toUpperCase();
112 else if ((flags & TAGS_DOWNCASE) != 0)
113 t.tagName = t.tagName.toLowerCase();
114 for (Iterator it=t.attributeList.attributes.iterator(); it.hasNext(); ) {
115 HtmlDocument.Attribute a = (HtmlDocument.Attribute) it.next();
116 if ((flags & ATTR_UPCASE) != 0)
117 a.name = a.name.toUpperCase();
118 else if ((flags & ATTR_DOWNCASE) != 0)
119 a.name = a.name.toLowerCase();
120 if (((flags & STRIP_QUOTES) != 0)
121 && a.hasValue
122 && isQuoted(a.value)
123 && safeToUnquote(a.value)) {
124 a.value = a.value.substring(1, a.value.length()-1);
125 }
126 if (((flags & QUOTE_ATTRS) != 0)
127 && a.hasValue) {
128 if (!isDoubleQuoted(a.value)) {
129 if (isSingleQuoted(a.value)) {
130 a.value = a.value.substring(1, a.value.length()-1);
131 }
132 a.value = "\"" + a.value + "\"";
133 }
134
135 }
136 }
137
138 previousElement = t;
139 }
140
141 public void visit(HtmlDocument.EndTag t) {
142 if ((flags & TAGS_UPCASE) != 0)
143 t.tagName = t.tagName.toUpperCase();
144 else if ((flags & TAGS_DOWNCASE) != 0)
145 t.tagName = t.tagName.toLowerCase();
146
147 previousElement = t;
148 }
149
150 public void visit(HtmlDocument.Text t) {
151 if (((flags & TRIM_SPACES) != 0)
152 && !inPreBlock
153 && (previousElement instanceof HtmlDocument.Newline
154 || previousElement instanceof HtmlDocument.Tag
155 || previousElement instanceof HtmlDocument.EndTag
156 || previousElement instanceof HtmlDocument.Comment)) {
157 int i;
158 for (i=0; i<t.text.length(); i++)
159 if (t.text.charAt(i) != ' '
160 && t.text.charAt(i) != '\t')
161 break;
162 if (i > 0)
163 t.text = t.text.substring(i);
164 }
165 previousElement = t;
166 }
167
168 public void visit(HtmlDocument.Comment c) { previousElement = c; }
169 public void visit(HtmlDocument.Newline n) { previousElement = n; }
170 public void visit(HtmlDocument.Annotation a) { previousElement = a; }
171 public void visit(HtmlDocument.TagBlock bl) {
172 if (bl.startTag.tagName.equalsIgnoreCase("PRE")
173 || bl.startTag.tagName.equalsIgnoreCase("SCRIPT")
174 || bl.startTag.tagName.equalsIgnoreCase("STYLE")) {
175 inPreBlock = true;
176 super.visit(bl);
177 inPreBlock = false;
178 }
179 else
180 super.visit(bl);
181 }
182 }
183
184