Coverage Report

Coverage Report - com.quiotix.html.parser.HtmlScrubber

Classes in this File

Line Coverage

Branch Coverage

Complexity

HtmlScrubber

51%

35/68

28%

24/84

 /*
  * HtmlScrubber.java -- cleans up HTML document tree.  
  * Copyright (C) 1999 Quiotix Corporation.  
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License, version 2, as 
  * published by the Free Software Foundation.  
  *
  * This program is distributed in the hope that it will be useful,
  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  * GNU General Public License (http://www.gnu.org/copyleft/gpl.txt)
  * for more details.
  */
 
 package com.quiotix.html.parser;
 import java.util.Iterator;
 
 /**
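  * HtmlScrubber is a Visitor which walks an HtmlDocument and cleans it up.
  * It can change tag and attribute names to uppercase or lowercase, strip
  * unnecessary quotes from attribute values or force attribute values to be
  * double-quoted, and strip leading spaces and tabs from text that follows
  * a newline, tag, end tag, or comment (outside PRE, SCRIPT and STYLE blocks).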
  *
  * @author Brian Goetz, Quiotix
  * Additional contributions by: Thorsten Weber
  */
 
 public class HtmlScrubber extends HtmlVisitor {
 
     /** Set tag case to upper. */
     public static final int TAGS_UPCASE     = 1;
     /** Set tag case to lower. */
     public static final int TAGS_DOWNCASE   = 2;
     /** Set attribute case to upper. */
     public static final int ATTR_UPCASE     = 4;
     /** Set attribute case to lower. */
     public static final int ATTR_DOWNCASE   = 8;
     /** Remove quotes. */
     public static final int STRIP_QUOTES    = 16;
     /** Trim spaces. */
     public static final int TRIM_SPACES     = 32;
     /** Quote attributes. */
     public static final int QUOTE_ATTRS     = 64;
     /** Defaults: downcase tags and attributes, quote attributes. */
     public static final int DEFAULT_OPTIONS =
             TAGS_DOWNCASE | ATTR_DOWNCASE | QUOTE_ATTRS;
 
     protected int flags;
     protected HtmlDocument.HtmlElement previousElement;
     protected boolean inPreBlock;
 
     /** 
      * Create an HtmlScrubber with the default options 
      * (downcase tags and tag attributes, strip out unnecessary quotes).
      */
     public HtmlScrubber() {
         this(DEFAULT_OPTIONS);
     };
 
     /** 
      * Create an HtmlScrubber with the desired set of options.
      * @param flags A bitmask representing the desired scrubbing options
      */
     public HtmlScrubber(int flags) {
         this.flags = flags;
     };
 
     private static boolean safeToUnquote(String qs) {
         int upperCount=0, lowerCount=0, idCount=0;
 
         for (int i=1; i < qs.length()-1; i++) {
             char c = qs.charAt(i);
             if (Character.isUnicodeIdentifierPart(c))
                 ++idCount;
             if (Character.isUpperCase(c))
                 ++upperCount;
             else if (Character.isLowerCase(c))
                 ++lowerCount;
         }
         return (qs.length()-2 > 0
                 && (qs.length()-2 == idCount
                 && (upperCount == 0 || lowerCount == 0)));
     }
 
     private static boolean isSingleQuoted(String s) {
       if (s.charAt(0) =='\'' && s.charAt(s.length()-1) == '\'') {
         return true;
       }
       else return false;
         
     }
     private static boolean isDoubleQuoted(String s) {
       if (s.charAt(0) =='"' && s.charAt(s.length()-1) == '"') {
         return true;
       }
       else return false;
         
     }
     private static boolean isQuoted(String s) {
       return isDoubleQuoted(s) || isSingleQuoted(s);
     }
 
     public void start() {
         previousElement = null;
         inPreBlock = false;
     }
 
     public void visit(HtmlDocument.Tag t) {
         if ((flags & TAGS_UPCASE) != 0)
             t.tagName = t.tagName.toUpperCase();
         else if ((flags & TAGS_DOWNCASE) != 0)
             t.tagName = t.tagName.toLowerCase();
         for (Iterator it=t.attributeList.attributes.iterator(); it.hasNext(); ) {
             HtmlDocument.Attribute a = (HtmlDocument.Attribute) it.next();
             if ((flags & ATTR_UPCASE) != 0)
                 a.name = a.name.toUpperCase();
             else if ((flags & ATTR_DOWNCASE) != 0)
                 a.name = a.name.toLowerCase();
             if (((flags & STRIP_QUOTES) != 0)
                 && a.hasValue
                 && isQuoted(a.value)
                 && safeToUnquote(a.value)) {
               a.value = a.value.substring(1, a.value.length()-1);
             }
             if (((flags & QUOTE_ATTRS) != 0)
                 && a.hasValue) {
               if (!isDoubleQuoted(a.value)) {
                 if (isSingleQuoted(a.value)) {
                   a.value = a.value.substring(1, a.value.length()-1);
                 } 
                 a.value = "\"" + a.value + "\"";
               }
               //System.err.println(a.value);
             }
         }
 
         previousElement = t;
     }
 
     public void visit(HtmlDocument.EndTag t) {
         if ((flags & TAGS_UPCASE) != 0)
             t.tagName = t.tagName.toUpperCase();
         else if ((flags & TAGS_DOWNCASE) != 0)
             t.tagName = t.tagName.toLowerCase();
 
         previousElement = t;
     }
 
     public void visit(HtmlDocument.Text t) {
         if (((flags & TRIM_SPACES) != 0)
                 && !inPreBlock
                 && (previousElement instanceof HtmlDocument.Newline
                 || previousElement instanceof HtmlDocument.Tag
                 || previousElement instanceof HtmlDocument.EndTag
                 || previousElement instanceof HtmlDocument.Comment)) {
             int i;
             for (i=0; i<t.text.length(); i++)
                 if (t.text.charAt(i) != ' '
                         && t.text.charAt(i) != '\t')
                     break;
             if (i > 0)
                 t.text = t.text.substring(i);
         }
         previousElement = t;
     }
 
     public void visit(HtmlDocument.Comment c)     { previousElement = c; }
     public void visit(HtmlDocument.Newline n)     { previousElement = n; }
     public void visit(HtmlDocument.Annotation a)  { previousElement = a; }
     public void visit(HtmlDocument.TagBlock bl) {
         if (bl.startTag.tagName.equalsIgnoreCase("PRE")
                 || bl.startTag.tagName.equalsIgnoreCase("SCRIPT")
                 || bl.startTag.tagName.equalsIgnoreCase("STYLE")) {
             inPreBlock = true;
             super.visit(bl);
             inPreBlock = false;
         }
         else
             super.visit(bl);
     }
 }
 
 

1		/*
2		* HtmlScrubber.java -- cleans up HTML document tree.
3		* Copyright (C) 1999 Quiotix Corporation.
4		*
5		* This program is free software; you can redistribute it and/or modify
6		* it under the terms of the GNU General Public License, version 2, as
7		* published by the Free Software Foundation.
8		*
9		* This program is distributed in the hope that it will be useful,
10		* but WITHOUT ANY WARRANTY; without even the implied warranty of
11		* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12		* GNU General Public License (http://www.gnu.org/copyleft/gpl.txt)
13		* for more details.
14		*/
15
16		package com.quiotix.html.parser;
17		import java.util.Iterator;
18
19		/**
20		* HtmlScrubber is a Visitor which walks an HtmlDocument and cleans it up.
21		* It can change tags and tag attributes to uppercase or lowercase, strip
22		* out unnecessary quotes from attribute values, and strip trailing spaces
23		* before a newline.
24		*
25		* @author Brian Goetz, Quiotix
26		* Additional contributions by: Thorsten Weber
27		*/
28
29		public class HtmlScrubber extends HtmlVisitor {
30
31		/** Set tag case to upper. */
32		public static final int TAGS_UPCASE = 1;
33		/** Set tag case to lower. */
34		public static final int TAGS_DOWNCASE = 2;
35		/** Set attribute case to upper. */
36		public static final int ATTR_UPCASE = 4;
37		/** Set attribute case to lower. */
38		public static final int ATTR_DOWNCASE = 8;
39		/** Remove quotes. */
40		public static final int STRIP_QUOTES = 16;
41		/** Trim spaces. */
42		public static final int TRIM_SPACES = 32;
43		/** Quote attributes. */
44		public static final int QUOTE_ATTRS = 64;
45		/** Defaults: downcase tags and attributes, quote attributes. */
46		public static final int DEFAULT_OPTIONS =
47		TAGS_DOWNCASE \| ATTR_DOWNCASE \| QUOTE_ATTRS;
48
49		protected int flags;
50		protected HtmlDocument.HtmlElement previousElement;
51		protected boolean inPreBlock;
52
53		/**
54		* Create an HtmlScrubber with the default options
55		* (downcase tags and tag attributes, strip out unnecessary quotes).
56		*/
57		public HtmlScrubber() {
58	0	this(DEFAULT_OPTIONS);
59	0	};
60
61		/**
62		* Create an HtmlScrubber with the desired set of options.
63		* @param flags A bitmask representing the desired scrubbing options
64		*/
65	4	public HtmlScrubber(int flags) {
66	4	this.flags = flags;
67	4	};
68
69		private static boolean safeToUnquote(String qs) {
70	0	int upperCount=0, lowerCount=0, idCount=0;
71
72	0	for (int i=1; i < qs.length()-1; i++) {
73	0	char c = qs.charAt(i);
74	0	if (Character.isUnicodeIdentifierPart(c))
75	0	++idCount;
76	0	if (Character.isUpperCase(c))
77	0	++upperCount;
78	0	else if (Character.isLowerCase(c))
79	0	++lowerCount;
80		}
81	0	return (qs.length()-2 > 0
82		&& (qs.length()-2 == idCount
83		&& (upperCount == 0 \|\| lowerCount == 0)));
84		}
85
86		private static boolean isSingleQuoted(String s) {
87	4	if (s.charAt(0) =='\'' && s.charAt(s.length()-1) == '\'') {
88	0	return true;
89		}
90	4	else return false;
91
92		}
93		private static boolean isDoubleQuoted(String s) {
94	4	if (s.charAt(0) =='"' && s.charAt(s.length()-1) == '"') {
95	0	return true;
96		}
97	4	else return false;
98
99		}
100		private static boolean isQuoted(String s) {
101	0	return isDoubleQuoted(s) \|\| isSingleQuoted(s);
102		}
103
104		public void start() {
105	4	previousElement = null;
106	4	inPreBlock = false;
107	4	}
108
109		public void visit(HtmlDocument.Tag t) {
110	20	if ((flags & TAGS_UPCASE) != 0)
111	0	t.tagName = t.tagName.toUpperCase();
112	20	else if ((flags & TAGS_DOWNCASE) != 0)
113	20	t.tagName = t.tagName.toLowerCase();
114	20	for (Iterator it=t.attributeList.attributes.iterator(); it.hasNext(); ) {
115	4	HtmlDocument.Attribute a = (HtmlDocument.Attribute) it.next();
116	4	if ((flags & ATTR_UPCASE) != 0)
117	0	a.name = a.name.toUpperCase();
118	4	else if ((flags & ATTR_DOWNCASE) != 0)
119	4	a.name = a.name.toLowerCase();
120	4	if (((flags & STRIP_QUOTES) != 0)
121		&& a.hasValue
122		&& isQuoted(a.value)
123		&& safeToUnquote(a.value)) {
124	0	a.value = a.value.substring(1, a.value.length()-1);
125		}
126	4	if (((flags & QUOTE_ATTRS) != 0)
127		&& a.hasValue) {
128	4	if (!isDoubleQuoted(a.value)) {
129	4	if (isSingleQuoted(a.value)) {
130	0	a.value = a.value.substring(1, a.value.length()-1);
131		}
132	4	a.value = "\"" + a.value + "\"";
133		}
134		//System.err.println(a.value);
135		}
136	4	}
137
138	20	previousElement = t;
139	20	}
140
141		public void visit(HtmlDocument.EndTag t) {
142	0	if ((flags & TAGS_UPCASE) != 0)
143	0	t.tagName = t.tagName.toUpperCase();
144	0	else if ((flags & TAGS_DOWNCASE) != 0)
145	0	t.tagName = t.tagName.toLowerCase();
146
147	0	previousElement = t;
148	0	}
149
150		public void visit(HtmlDocument.Text t) {
151	8	if (((flags & TRIM_SPACES) != 0)
152		&& !inPreBlock
153		&& (previousElement instanceof HtmlDocument.Newline
154		\|\| previousElement instanceof HtmlDocument.Tag
155		\|\| previousElement instanceof HtmlDocument.EndTag
156		\|\| previousElement instanceof HtmlDocument.Comment)) {
157		int i;
158	6	for (i=0; i<t.text.length(); i++)
159	6	if (t.text.charAt(i) != ' '
160		&& t.text.charAt(i) != '\t')
161	4	break;
162	4	if (i > 0)
163	2	t.text = t.text.substring(i);
164		}
165	8	previousElement = t;
166	8	}
167
168	0	public void visit(HtmlDocument.Comment c) { previousElement = c; }
169	4	public void visit(HtmlDocument.Newline n) { previousElement = n; }
170	0	public void visit(HtmlDocument.Annotation a) { previousElement = a; }
171		public void visit(HtmlDocument.TagBlock bl) {
172	0	if (bl.startTag.tagName.equalsIgnoreCase("PRE")
173		\|\| bl.startTag.tagName.equalsIgnoreCase("SCRIPT")
174		\|\| bl.startTag.tagName.equalsIgnoreCase("STYLE")) {
175	0	inPreBlock = true;
176	0	super.visit(bl);
177	0	inPreBlock = false;
178		}
179		else
180	0	super.visit(bl);
181	0	}
182		}
183
184