001/*
002// $Id: //open/util/resgen/src/org/eigenbase/xom/XMLUtil.java#5 $
003// Package org.eigenbase.xom is an XML Object Mapper.
004// Copyright (C) 2005-2005 The Eigenbase Project
005// Copyright (C) 2005-2005 Disruptive Tech
006// Copyright (C) 2005-2005 LucidEra, Inc.
007// Portions Copyright (C) 2001-2005 Kana Software, Inc. and others.
008//
009// This library is free software; you can redistribute it and/or modify it
010// under the terms of the GNU Lesser General Public License as published by the
011// Free Software Foundation; either version 2 of the License, or (at your
012// option) any later version approved by The Eigenbase Project.
013//
014// This library is distributed in the hope that it will be useful,
015// but WITHOUT ANY WARRANTY; without even the implied warranty of
016// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
017// GNU Lesser General Public License for more details.
018//
019// You should have received a copy of the GNU Lesser General Public License
020// along with this library; if not, write to the Free Software
021// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
022//
023// jhyde, 3 October, 2001
024*/
025
026package org.eigenbase.xom;
027import java.io.IOException;
028import java.io.PrintWriter;
029import java.io.Reader;
030
031/**
032 * Utilities for dealing with XML data.  These methods must NOT depend upon any
033 * XML parser or object model (MSXML, DOM, SAX, etc.)
034 *
035 * @author jhyde
036 * @since 3 October, 2001
037 * @version $Id: //open/util/resgen/src/org/eigenbase/xom/XMLUtil.java#5 $
038 **/
039public class XMLUtil {
040
041    /**
042     * Determine if a String contains any XML special characters, return true
043     * if it does.  If this function returns true, the string will need to be
044     * encoded either using the stringEncodeXML function above or using a
045     * CDATA section.  Note that MSXML has a nasty bug whereby whitespace
046     * characters outside of a CDATA section are lost when parsing.  To
047     * avoid hitting this bug, this method treats many whitespace characters
048     * as "special".
049     * @param input the String to scan for XML special characters.
050     * @return true if the String contains any such characters.
051     */
052    public static boolean stringHasXMLSpecials(String input)
053    {
054        for (int i = 0; i < input.length(); i++) {
055            char c = input.charAt(i);
056            switch (c) {
057            case '<':
058            case '>':
059            case '"':
060            case '\'':
061            case '&':
062            case '\t':
063            case '\n':
064            case '\r':
065                return true;
066            }
067        }
068        return false;
069    }
070
071    /**
072     * Encode a String for XML output, displaying it to a PrintWriter.
073     * The String to be encoded is displayed, except that
074     * special characters are converted into entities.
075     * @param input a String to convert.
076     * @param out a PrintWriter to which to write the results.
077     */
078    public static void stringEncodeXML(String input, PrintWriter out)
079    {
080        for (int i = 0; i < input.length(); i++) {
081            char c = input.charAt(i);
082            switch (c) {
083            case '<':
084            case '>':
085            case '"':
086            case '\'':
087            case '&':
088            case '\t':
089            case '\n':
090            case '\r':
091                out.print("&#" + (int)c + ";");
092                break;
093            default:
094                out.print(c);
095            }
096        }
097    }
098
099    /**
100     * Quote a string, and write to a {@link PrintWriter}.
101     *
102     * <p>For example, <code>"a string"</code> becomes <code>&lt![CDATA[a
103     * string]]&gt;</code>.  If the string contains ']]&gt;' (which commonly
104     * occurs when wrapping other XML documents), we give up on using
105     * <code>&lt![CDATA[</code> ... <code>]]&gt;</code>, and just encode the
106     * string.  For example, <code>"A string with ]]&gt; in it"</code> becomes
107     * <code>"A string with ]]&amp;gt; in it"</code>.</p>
108     **/
109    public static void printPCDATA(PrintWriter pw, String data)
110    {
111        if (data.indexOf("]]>") > -1) {
112            String s = StringEscaper.xmlEscaper.escapeString(data);
113            pw.print(s);
114        } else {
115            pw.print("<![CDATA[");
116            pw.print(data);
117            pw.print("]]>");
118        }
119    }
120
121    /**
122     * Quote a string.
123     *
124     * @see #printPCDATA(PrintWriter,String)
125     **/
126    public static String quotePCDATA(String data)
127    {
128        if (data.indexOf("]]>") > -1) {
129            return StringEscaper.xmlEscaper.escapeString(data);
130        } else {
131            return "<![CDATA[" + data + "]]>";
132        }
133    }
134
135    /**
136     * Quote a string in an element and a CDATA, and write to a {@link
137     * PrintWriter}.  For example, it <code>tag</code> is "Value", then
138     * <code>"a string"</code> becomes <code>&ltValue&gt;&lt![CDATA[a
139     * string]]&gt;&lt/Value&gt;.
140     *
141     * @param newline whether to print a newline after the element
142     * @see #printPCDATA(PrintWriter,String)
143     **/
144    public static void printPCDATA(
145        PrintWriter pw, String tag, String data, boolean newline)
146    {
147        if (data == null || data.length() == 0) {
148            return;
149        }
150        pw.print("<");
151        pw.print(tag);
152        pw.print(">");
153        printPCDATA(pw,data);
154        pw.print("</");
155        pw.print(tag);
156        pw.print(">");
157        if (newline) {
158            pw.println();
159        }
160    }
161
162    public static void printPCDATA(PrintWriter pw, String tag, String data)
163    {
164        boolean newline = false;
165        printPCDATA(pw, tag, data, newline);
166    }
167
168    private static String escapeForQuoting(String val)
169    {
170        return StringEscaper.xmlNumericEscaper.escapeString(val);
171    }
172
173    /** Quote a string so that it can be included as an XML attribute value. */
174    public static String quoteAtt(String val)
175    {
176        return "\"" + escapeForQuoting(val) + "\"";
177    }
178
179    /** Return an XML attribute/value pair for String val */
180    public static String quoteAtt(String name, String val)
181    {
182        if ((val == null) || val.equals("")) {
183            return "";
184        }
185        return " " + name + "=" + quoteAtt(val);
186    }
187
188    /** Return an XML attribute/value pair for int val */
189    public static String quoteAtt(String name, int val)
190    {
191        return " " + name + "=\"" + val + "\"";
192    }
193
194    /** Return an XML attribute/value pair for boolean val */
195    public static String quoteAtt(String name, boolean val)
196    {
197        return " " + name + "=\"" + (val ? "TRUE" : "FALSE") + "\"";
198    }
199
200    /** Quote a string so that it can be included as an XML attribute value. */
201    public static void printAtt(PrintWriter pw, String val)
202    {
203        pw.print("\"");
204        pw.print(escapeForQuoting(val));
205        pw.print("\"");
206    }
207
208    /** Print an XML attribute name and value for string val */
209    public static void printAtt(PrintWriter pw, String name, String val)
210    {
211        if (val != null /* && !val.equals("") */) {
212            pw.print(" ");
213            pw.print(name);
214            pw.print("=\"");
215            pw.print(escapeForQuoting(val));
216            pw.print("\"");
217        }
218    }
219
220    /** Print an XML attribute name and value for int val */
221    public static void printAtt(PrintWriter pw, String name, int val)
222    {
223        pw.print(" ");
224        pw.print(name);
225        pw.print("=\"");
226        pw.print(val);
227        pw.print("\"");
228    }
229
230    /** Print an XML attribute name and value for boolean val */
231    public static void printAtt(PrintWriter pw, String name, boolean val)
232    {
233        pw.print(" ");
234        pw.print(name);
235        pw.print(val ? "=\"true\"" : "=\"false\"");
236    }
237
238    /**
239     * Retrieve the name of the first tag in the XML document specified by the
240     * given Reader, without parsing the full file/string.  This function is
241     * useful to identify the DocType of an XML document before parsing,
242     * possibly to send the document off to different pieces of code.
243     * For performance reasons, the function attempts to read as little of
244     * the file or string as possible before making its decision about the
245     * first tag.  Leading comments are ignored.
246     * @param xml a Reader containing an XML document.
247     * @return the first tag name, as a String, or null if no first tag
248     * can be found.
249     */
250    public static String getFirstTagName(Reader xml)
251    {
252        final int OUTSIDE = 0;  // constant: identify outside state
253        final int BRACKET = 1;  // constant: bracket, contents unknown
254        final int COMMENT = 2;  // constant: identify a comment section
255        final int IGNORE = 3;   // constant: identify an ignored section
256        final int TAG = 4;      // constant: identify a tag section
257
258        int state = OUTSIDE;
259        String commentMatch = null;
260        StringBuffer tagBuffer = null;
261        boolean sawBang = false;
262
263        try {
264            int c = xml.read();
265            for (;;) {
266                // No tag found if we hit EOF first.
267                if (c == -1) {
268                    return null;
269                }
270                switch (state) {
271                case OUTSIDE:
272                    // Start of any sort of tag
273                    if (c == '<') {
274                        state = BRACKET;
275                        commentMatch = "!--";
276                        sawBang = false;
277                        c = xml.read();
278
279                        // Other non-whitespace characters outside of any tag
280                    } else if (!Character.isWhitespace((char) c)) {
281                        return null;
282
283                        // Whitespace characters are ignored
284                    } else {
285                        c = xml.read();
286                    }
287                    break;
288
289                case BRACKET:
290                    // Check for the start of a comment.
291                    if (commentMatch != null) {
292                        if (c == commentMatch.charAt(0)) {
293                            // This match indicates a comment
294                            if (commentMatch.length() == 1) {
295                                c = xml.read();
296                                commentMatch = "-->";
297                                state = COMMENT;
298                            } else {
299                                // Remove the first character from commentMatch,
300                                // then process the character as usual.
301                                commentMatch =
302                                    commentMatch.substring(1, commentMatch.length());
303                            }
304                        } else {
305                            // No longer eligible for comment.
306                            commentMatch = null;
307                        }
308                    }
309
310                    // Hit whitespace; ignore the character.
311                    if (Character.isWhitespace((char) c)) {
312                        c = xml.read();
313                        break;
314                    }
315
316                    switch (c) {
317                    case '?':
318                        c = xml.read();
319                        state = IGNORE;
320                        break;
321                    case '!':
322                        // Enter an ignored section unless eligible for comment.
323                        c = xml.read();
324                        sawBang = true;
325                        if (commentMatch == null) {
326                            state = IGNORE;
327                        }
328                        break;
329                    case '-':
330                        // Enter an ignored section unless eligible for comment.
331                        c = xml.read();
332                        if (commentMatch == null) {
333                            state = IGNORE;
334                        }
335                        break;
336                    case '>':
337                        // Return to OUTSIDE state immediately
338                        c = xml.read();
339                        state = OUTSIDE;
340                        break;
341                    default:
342                        // State depends on whether we saw a ! or not.
343                        if (sawBang) {
344                            state = IGNORE;
345                        } else {
346                            state = TAG;
347                        }
348                        tagBuffer = new StringBuffer();
349                    }
350                    break;
351
352                case COMMENT:
353                    // Did we match the next expected end-of-comment character?
354                    if (c == commentMatch.charAt(0)) {
355                        c = xml.read();
356                        if (commentMatch.length() == 1) {
357                            // Done with the comment
358                            state = OUTSIDE;
359                        } else {
360                            commentMatch =
361                                commentMatch.substring(1, commentMatch.length());
362                        }
363                    } else {
364                        // If not, restart our quest for the end-of-comment character.
365                        c = xml.read();
366                        commentMatch = "-->";
367                    }
368                    break;
369
370                case IGNORE:
371                    // Drop out on a close >.  Ignore all other characters.
372                    if (c == '>') {
373                        c = xml.read();
374                        state = OUTSIDE;
375                    } else {
376                        c = xml.read();
377                    }
378                    break;
379
380                case TAG:
381                    // Store characters in the tag buffer until we hit whitespace.
382                    // When we hit whitespace or '>' or '/', return the name of the tag.
383                    if (Character.isWhitespace((char)c) || c == '>'
384                        || c == '/') {
385                        return tagBuffer.toString();
386                    } else {
387                        tagBuffer.append((char)c);
388                        c = xml.read();
389                    }
390                    break;
391                }
392            }
393        } catch (IOException ex) {
394            // On exception, we can't determine the first tag, so return null.
395            return null;
396        }
397    }
398}
399
400
401// End XMLUtil.java