1
2 package org.xwt.translators;
3
4 import java.util.*;
5 import java.net.*;
6 import java.io.*;
7 import org.xwt.js.*;
8 import org.xwt.util.*;
9
10
25
26
38 public class HTML {
39
40 private final static String[] noEndTag =
41 new String[] { "area", "base", "basefont", "br", "col", "frame", "hr", "img",
42 "input", "isindex", "link", "meta", "param" };
43
44
45 private static char[] cbuf = null;
46
47
48 private static StringBuffer sbuf = null;
49
50
51 private static boolean withinLI = false;
52
53 public static synchronized JS parseReader(Reader r) throws IOException, JSExn {
54 CharStream cs = new CharStream(r);
55 JS h = new JS();
56
57 withinLI = false;
58 h.put("$name", "html");
59
60 try {
61 while (true) parseBody(cs, h, null);
62 } catch (EOFException e) {
63
64 }
65
66
74 return h;
75 }
76
77
87 private static String parseElement(CharStream cs, JS h) throws IOException, JSExn {
88
89 while(Character.isSpace(cs.peek())) cs.get();
90 String elementName = parseElementName(cs);
91
92 boolean saveWithinLI = withinLI;
93 if (elementName.equals("li")) {
94 if (withinLI) {
95 cs.unread(new char[] { '<', 'l', 'i', ' ' });
96 return "li";
97 } else {
98 withinLI = true;
99 }
100 } else if (elementName.equals("ol") || elementName.equals("ul")) {
101 withinLI = false;
102 }
103
104 h.put("$name", elementName);
105 if (elementName.equals("!--")) {
106 h.put("0", parseComment(cs));
107 h.put("$numchildren", new Integer(0));
108 return null;
109 }
110
111
112 while (cs.peek() != '>') {
113 String name = parseAttributeName(cs);
114 if (name.equals("")) break;
115 String value = expandEntities(parseAttributeValue(cs));
116 h.put(name, value);
117 }
118
119
120 cs.get();
121
122
123 for(int i=0; i<noEndTag.length; i++)
124 if (noEndTag[i].equals(elementName))
125 return null;
126
127
128 String ret = parseBody(cs, h, elementName);
129 withinLI = saveWithinLI;
130 return ret;
131 }
132
133
138 private static String parseBody(CharStream cs, JS h, String elementName) throws IOException, JSExn {
139 String cdata = "";
140 int length = h.get("$numchildren") == null ? 0 : Integer.parseInt(h.get("$numchildren").toString());
141 while(true) {
142 String closetag = null;
143
144 try {
145 char c = cs.get();
146 if (c != '<') { cdata += c; continue; }
147 String expanded = removeRedundantWhitespace(expandEntities(cdata));
148 if (expanded.length() > 0) {
149 h.put(String.valueOf(length), expanded);
150 h.put("$numchildren", new Integer(++length));
151 }
152 cdata = "";
153
154 } catch (EOFException e) {
155 String expanded = removeRedundantWhitespace(expandEntities(cdata));
156 if (expanded.length() > 0) {
157 h.put(String.valueOf(length), expanded);
158 h.put("$numchildren", new Integer(++length));
159 }
160 throw e;
161 }
162
163 try {
164
165 if (cs.peek() != '/') {
166 JS kid = new JS();
167 closetag = parseElement(cs, kid);
168 h.put(String.valueOf(length), kid);
169 h.put("$numchildren", new Integer(++length));
170
171
172 } else {
173 cs.get();
174 closetag = parseElementName(cs);
175 while(cs.get() != '>');
176 }
177 } catch (EOFException e) {
178 throw e;
179
180 }
181
182 if (closetag != null)
183 return closetag.equals(elementName) ? null : closetag;
184 }
185 }
186
187
190 private static String parseElementName(CharStream cs) throws IOException, JSExn {
191 String ret = "";
192 while (cs.peek() != '>' && !Character.isSpace(cs.peek())) ret += cs.get();
193 return ret.toLowerCase();
194 }
195
196
200 private static String parseAttributeName(CharStream cs) throws IOException, JSExn {
201 while(Character.isSpace(cs.peek())) cs.get();
202 String ret = "";
203 while(!Character.isSpace(cs.peek()) && cs.peek() != '=' && cs.peek() != '>') ret += cs.get();
204 return ret.toLowerCase();
205 }
206
207
211 private static String parseAttributeValue(CharStream cs) throws IOException, JSExn {
212
213
214 while(Character.isSpace(cs.peek())) cs.get();
215 if (cs.peek() != '=') return "";
216 cs.get();
217 while(Character.isSpace(cs.peek())) cs.get();
218
219 boolean doublequoted = false;
220 boolean singlequoted = false;
221 String ret = "";
222
223 if (cs.peek() == '\"') { doublequoted = true; cs.get(); }
224 else if (cs.peek() == '\'') { singlequoted = true; cs.get(); }
225
226 while(true) {
227 char c = cs.peek();
228 if (!doublequoted && !singlequoted && (Character.isSpace(c) || c == '>')) break;
229 if (singlequoted && c == '\'') { cs.get(); break; }
230 if (doublequoted && c == '\"') { cs.get(); break; }
231 ret += cs.get();
232 }
233 return ret;
234 }
235
236
239 private static String parseComment(CharStream cs) throws IOException, JSExn {
240 int dashes = 0;
241 String ret = "";
242 while(true) {
243 char c = cs.get();
244 if (c == '>' && dashes == 2) return ret.substring(0, ret.length() - 2);
245 if (c == '-') dashes++;
246 else dashes = 0;
247 ret += c;
248 }
249 }
250
251
252 public static String expandEntities(String s) throws IOException, JSExn {
253 if (s.indexOf('&') == -1) return s;
254 StringBuffer sb = new StringBuffer();
255 int i=0;
256 int nextamp = 0;
257 while(nextamp != -1) {
258 nextamp = s.indexOf('&', i);
259 sb.append(nextamp == -1 ? s.substring(i) : s.substring(i, nextamp));
260 if (nextamp == -1) break;
261 if (s.regionMatches(nextamp, "&", 0, 5)) {
262 sb.append("&");
263 i = nextamp + 5;
264 } else if (s.regionMatches(nextamp, ">", 0, 4)) {
265 sb.append(">");
266 i = nextamp + 4;
267 } else if (s.regionMatches(nextamp, "<", 0, 4)) {
268 sb.append("<");
269 i = nextamp + 4;
270 } else if (s.regionMatches(nextamp, """, 0, 6)) {
271 sb.append("\"");
272 i = nextamp + 6;
273 } else if (s.regionMatches(nextamp, " ", 0, 6)) {
274
275 sb.append(" ");
276 i = nextamp + 6;
277 } else {
278 sb.append("&");
279 i = nextamp + 1;
280 }
281 }
282 return sb.toString();
283 }
284
285
286 private static String removeRedundantWhitespace(String s) throws JSExn {
287
288 if (s.indexOf(' ') == -1 && s.indexOf('\n') == -1 && s.indexOf('\t') == -1 && s.indexOf('\r') == -1) return s;
289
290 int len = s.length();
291 if (cbuf == null || cbuf.length < len) {
292 cbuf = new char[len * 2];
293 sbuf = new StringBuffer(len * 2);
294 }
295 sbuf.setLength(0);
296 s.getChars(0, len, cbuf, 0);
297
298 int last = 0;
299 boolean lastWasWhitespace = false;
300 for(int i=0; i<len; i++) {
301 boolean lastlast = lastWasWhitespace;
302 switch(cbuf[i]) {
303 case '\n': case '\r': case '\t':
304 cbuf[i] = ' ';
305 case ' ':
306 lastWasWhitespace = true;
307 break;
308 default:
309 lastWasWhitespace = false;
310 break;
311 }
312 if (lastWasWhitespace && lastlast) {
313 if (last != i) sbuf.append(cbuf, last, i - last);
314 last = i+1;
315 }
316 }
317
318 if (last != len) sbuf.append(cbuf, last, len - last);
319 return sbuf.toString().trim();
320 }
321
322
323
324 private static class CharStream extends PushbackReader {
325 public CharStream(Reader r) { super(r, 1024); }
326
327 public char peek() throws IOException {
328 char c = get();
329 unread(c);
330 return c;
331 }
332
333 public char get() throws IOException {
334 int i = read();
335 if (i == -1) throw new EOFException();
336 return (char)i;
337 }
338 }
339
340 }
341
342