1
2 package org.xwt;
3
4 import java.util.*;
5 import java.net.*;
6 import java.io.*;
7 import org.xwt.js.*;
8 import org.xwt.util.*;
9
10
25
26
38 public class HTML {
39
40
41 private final static String[] bodylessTags = new String[] { "br", "hr", "input", "img", "isindex" };
42
43
44 private static char[] cbuf = null;
45
46
47 private static StringBuffer sbuf = null;
48
49
50 private static boolean withinLI = false;
51
52 public static synchronized JS parseReader(Reader r) throws IOException {
53 CharStream cs = new CharStream(r);
54 JS.Obj h = new JS.Obj();
55
56 withinLI = false;
57 h.put("$name", "html");
58
59 try {
60 while (true) parseBody(cs, h, null);
61 } catch (EOFException e) {
62
63 }
64
65 Object[] ids = h.keys();
66 for(int i=0; i<ids.length; i++) {
67 Object el = h.get((String)ids[i]);
68 if (el instanceof JS && "html".equals(((JS)el).get("$name")))
69 return (JS)el;
70 }
71
72 return h;
73 }
74
75
85 private static String parseElement(CharStream cs, JS h) throws IOException {
86
87 while(Character.isSpace(cs.peek())) cs.get();
88 String elementName = parseElementName(cs);
89
90
91 boolean saveWithinLI = withinLI;
92 if (elementName.equals("li")) {
93 if (withinLI) {
94 cs.unread(new char[] { '<', 'l', 'i', ' ' });
95 return "li";
96 } else {
97 withinLI = true;
98 }
99 } else if (elementName.equals("ol") || elementName.equals("ul")) {
100 withinLI = false;
101 }
102
103 h.put("$name", elementName);
104 if (elementName.equals("!--")) {
105 h.put("0", parseComment(cs));
106 h.put("$numchildren", new Integer(0));
107 return null;
108 }
109
110
111 while (cs.peek() != '>') {
112 String name = parseAttributeName(cs);
113 if (name.equals("")) break;
114 String value = expandEntities(parseAttributeValue(cs));
115 h.put(name, value);
116 }
117
118
119 cs.get();
120
121
122 for(int i=0; i<bodylessTags.length; i++)
123 if (bodylessTags[i].equals(elementName))
124 return null;
125
126
127 String ret = parseBody(cs, h, elementName);
128 withinLI = saveWithinLI;
129 return ret;
130 }
131
132
137 private static String parseBody(CharStream cs, JS h, String elementName) throws IOException {
138 String cdata = "";
139 int length = h.get("$numchildren") == null ? 0 : Integer.parseInt(h.get("$numchildren").toString());
140 while(true) {
141 String closetag = null;
142
143 try {
144 char c = cs.get();
145 if (c != '<') { cdata += c; continue; }
146 String expanded = removeRedundantWhitespace(expandEntities(cdata));
147 if (expanded.length() > 0) {
148 h.put(String.valueOf(length), expanded);
149 h.put("$numchildren", new Integer(++length));
150 }
151 cdata = "";
152
153 } catch (EOFException e) {
154 String expanded = removeRedundantWhitespace(expandEntities(cdata));
155 if (expanded.length() > 0) {
156 h.put(String.valueOf(length), expanded);
157 h.put("$numchildren", new Integer(++length));
158 }
159 throw e;
160 }
161
162 try {
163
164 if (cs.peek() != '/') {
165 JS kid = new JS.Obj();
166 closetag = parseElement(cs, kid);
167 h.put(String.valueOf(length), kid);
168 h.put("$numchildren", new Integer(++length));
169
170
171 } else {
172 cs.get();
173 closetag = parseElementName(cs);
174 while(cs.get() != '>');
175 }
176 } catch (EOFException e) {
177 throw e;
178
179 }
180
181 if (closetag != null)
182 return closetag.equals(elementName) ? null : closetag;
183 }
184 }
185
186
189 private static String parseElementName(CharStream cs) throws IOException {
190 String ret = "";
191 while (cs.peek() != '>' && !Character.isSpace(cs.peek())) ret += cs.get();
192 return ret.toLowerCase();
193 }
194
195
199 private static String parseAttributeName(CharStream cs) throws IOException {
200 while(Character.isSpace(cs.peek())) cs.get();
201 String ret = "";
202 while(!Character.isSpace(cs.peek()) && cs.peek() != '=' && cs.peek() != '>') ret += cs.get();
203 return ret.toLowerCase();
204 }
205
206
210 private static String parseAttributeValue(CharStream cs) throws IOException {
211
212
213 while(Character.isSpace(cs.peek())) cs.get();
214 if (cs.peek() != '=') return "";
215 cs.get();
216 while(Character.isSpace(cs.peek())) cs.get();
217
218 boolean doublequoted = false;
219 boolean singlequoted = false;
220 String ret = "";
221
222 if (cs.peek() == '\"') { doublequoted = true; cs.get(); }
223 else if (cs.peek() == '\'') { singlequoted = true; cs.get(); }
224
225 while(true) {
226 char c = cs.peek();
227 if (!doublequoted && !singlequoted && (Character.isSpace(c) || c == '>')) break;
228 if (singlequoted && c == '\'') { cs.get(); break; }
229 if (doublequoted && c == '\"') { cs.get(); break; }
230 ret += cs.get();
231 }
232 return ret;
233 }
234
235
238 private static String parseComment(CharStream cs) throws IOException {
239 int dashes = 0;
240 String ret = "";
241 while(true) {
242 char c = cs.get();
243 if (c == '>' && dashes == 2) return ret.substring(0, ret.length() - 2);
244 if (c == '-') dashes++;
245 else dashes = 0;
246 ret += c;
247 }
248 }
249
250
251 public static String expandEntities(String s) throws IOException {
252 if (s.indexOf('&') == -1) return s;
253 StringBuffer sb = new StringBuffer();
254 int i=0;
255 int nextamp = 0;
256 while(nextamp != -1) {
257 nextamp = s.indexOf('&', i);
258 sb.append(nextamp == -1 ? s.substring(i) : s.substring(i, nextamp));
259 if (nextamp == -1) break;
260 if (s.regionMatches(nextamp, "&", 0, 5)) {
261 sb.append("&");
262 i = nextamp + 5;
263 } else if (s.regionMatches(nextamp, ">", 0, 4)) {
264 sb.append(">");
265 i = nextamp + 4;
266 } else if (s.regionMatches(nextamp, "<", 0, 4)) {
267 sb.append("<");
268 i = nextamp + 4;
269 } else if (s.regionMatches(nextamp, """, 0, 6)) {
270 sb.append("\"");
271 i = nextamp + 6;
272 } else if (s.regionMatches(nextamp, " ", 0, 6)) {
273
274 sb.append(" ");
275 i = nextamp + 6;
276 } else {
277 sb.append("&");
278 i = nextamp + 1;
279 }
280 }
281 return sb.toString();
282 }
283
284
285
286 private static String removeRedundantWhitespace(String s) {
287
288 if (s.indexOf(' ') == -1 && s.indexOf('\n') == -1 && s.indexOf('\t') == -1 && s.indexOf('\r') == -1) return s;
289
290 int len = s.length();
291 if (cbuf == null || cbuf.length < len) {
292 cbuf = new char[len * 2];
293 sbuf = new StringBuffer(len * 2);
294 }
295 sbuf.setLength(0);
296 s.getChars(0, len, cbuf, 0);
297
298 int last = 0;
299 boolean lastWasWhitespace = false;
300 for(int i=0; i<len; i++) {
301 boolean lastlast = lastWasWhitespace;
302 switch(cbuf[i]) {
303 case '\n': case '\r': case '\t':
304 cbuf[i] = ' ';
305 case ' ':
306 lastWasWhitespace = true;
307 break;
308 default:
309 lastWasWhitespace = false;
310 break;
311 }
312 if (lastWasWhitespace && lastlast) {
313 if (last != i) sbuf.append(cbuf, last, i - last);
314 last = i+1;
315 }
316 }
317
318 if (last != len) sbuf.append(cbuf, last, len - last);
319 return sbuf.toString().trim();
320 }
321
322
323
324 private static class CharStream extends PushbackReader {
325 public CharStream(Reader r) { super(r, 1024); }
326
327 public char peek() throws IOException {
328 char c = get();
329 unread(c);
330 return c;
331 }
332
333 public char get() throws IOException {
334 int i = read();
335 if (i == -1) throw new EOFException();
336 return (char)i;
337 }
338 }
339
340 }
341
342