001/*
002 * Licensed to the Apache Software Foundation (ASF) under one or more
003 * contributor license agreements.  See the NOTICE file distributed with
004 * this work for additional information regarding copyright ownership.
005 * The ASF licenses this file to You under the Apache License, Version 2.0
006 * (the "License"); you may not use this file except in compliance with
007 * the License.  You may obtain a copy of the License at
008 *
009 *      http://www.apache.org/licenses/LICENSE-2.0
010 *
011 * Unless required by applicable law or agreed to in writing, software
012 * distributed under the License is distributed on an "AS IS" BASIS,
013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
014 * See the License for the specific language governing permissions and
015 * limitations under the License.
016 */
017package org.apache.wicket.util.encoding;
018
019import java.io.ByteArrayOutputStream;
020import java.io.UnsupportedEncodingException;
021import java.nio.charset.Charset;
022import java.nio.charset.IllegalCharsetNameException;
023import java.nio.charset.UnsupportedCharsetException;
024
025import org.apache.wicket.util.lang.Args;
026
027/**
028 * Adapted from Spring Framework's UriUtils class, but defines instances for query string encoding versus URL path
029 * component encoding.
030 * <p/>
031 * The difference is important because a space is encoded as a + in a query string, but this is a
032 * valid value in a path component (and is therefore not decode back to a space).
033 *
034 * @author Doug Donohoe
035 * @author Thomas Heigl
036 * @see <a href="http://www.ietf.org/rfc/rfc2396.txt">RFC-2396</a>
037 */
038public class UrlEncoder
039{
040
041        enum Type {
042                //@formatter:off
043                QUERY {
044                        @Override
045                        public boolean isAllowed(int c) 
046                        {
047                                return isPchar(c) ||
048                                                ' ' == c || // encoding a space to a + is done in the encode() method
049                                                '*' == c ||
050                                                '/' == c || // to allow direct passing of URL in query
051                                                ',' == c ||
052                                                ':' == c || // allowed and used in wicket interface
053                                                '@' == c ;
054                        }
055                },
056                PATH {
057                        @Override
058                        public boolean isAllowed(int c) 
059                        {
060                                return isPchar(c) ||
061                                                '*' == c ||
062                                                '&' == c ||
063                                                '+' == c ||
064                                                ',' == c ||
065                                                ';' == c || // semicolon is used in ;jsessionid=
066                                                '=' == c ||
067                                                ':' == c || // allowed and used in wicket interface
068                                                '@' == c ;
069
070                        }
071                },
072                HEADER {
073                        @Override
074                        public boolean isAllowed(int c) 
075                        {
076                                return isPchar(c) ||
077                                                '#' == c ||
078                                                '&' == c ||
079                                                '+' == c ||
080                                                '^' == c ||
081                                                '`' == c ||
082                                                '|' ==c;
083                        }
084                };
085                //@formatter:on
086
087                /**
088                 * Indicates whether the given character is allowed in this URI component.
089                 * @return {@code true} if the character is allowed; {@code false} otherwise
090                 */
091                public abstract boolean isAllowed(int c);
092
093                /**
094                 * Indicates whether the given character is in the {@code ALPHA} set.
095                 * @see <a href="https://www.ietf.org/rfc/rfc3986.txt">RFC 3986, appendix A</a>
096                 */
097                protected boolean isAlpha(int c)
098                {
099                        return (c >= 'a' && c <= 'z' || c >= 'A' && c <= 'Z');
100                }
101
102                /**
103                 * Indicates whether the given character is in the {@code DIGIT} set.
104                 * @see <a href="https://www.ietf.org/rfc/rfc3986.txt">RFC 3986, appendix A</a>
105                 */
106                protected boolean isDigit(int c)
107                {
108                        return (c >= '0' && c <= '9');
109                }
110
111                /**
112                 * Indicates whether the given character is in the {@code sub-delims} set.
113                 * @see <a href="https://www.ietf.org/rfc/rfc3986.txt">RFC 3986, appendix A</a>
114                 */
115                protected boolean isSubDelimiter(int c)
116                {
117                        return ('!' == c || '$' == c);
118                }
119
120                /**
121                 * Indicates whether the given character is in the {@code unreserved} set.
122                 * @see <a href="https://www.ietf.org/rfc/rfc3986.txt">RFC 3986, appendix A</a>
123                 */
124                protected boolean isUnreserved(int c)
125                {
126                        return (isAlpha(c) || isDigit(c) || '-' == c || '.' == c || '_' == c || '~' == c);
127                }
128
129                /**
130                 * Indicates whether the given character is in the {@code pchar} set.
131                 * @see <a href="https://www.ietf.org/rfc/rfc3986.txt">RFC 3986, appendix A</a>
132                 */
133                protected boolean isPchar(int c)
134                {
135                        return (isUnreserved(c) || isSubDelimiter(c));
136                }
137        }
138
139        private final Type type;
140
141        /**
142         * Encoder used to encode name or value components of a query string.<br/>
143         * <br/>
144         *
145         * For example: http://org.acme/notthis/northis/oreventhis?buthis=isokay&amp;asis=thispart
146         */
147        public static final UrlEncoder QUERY_INSTANCE = new UrlEncoder(Type.QUERY);
148
149        /**
150         * Encoder used to encode segments of a path.<br/>
151         * <br/>
152         *
153         * For example: http://org.acme/foo/thispart/orthispart?butnot=thispart
154         */
155        public static final UrlEncoder PATH_INSTANCE = new UrlEncoder(Type.PATH);
156
157        /**
158         * Encoder used to encode a header.
159         */
160        public static final UrlEncoder HEADER_INSTANCE = new UrlEncoder(Type.HEADER);
161
162        /**
163         * Allow subclass to call constructor.
164         *
165         * @param type
166         *            encoder type
167         */
168        protected UrlEncoder(final Type type)
169        {
170                this.type = type;
171        }
172
173        /**
174         * @param s
175         *            string to encode
176         * @param charsetName
177         *            charset to use for encoding
178         * @return encoded string
179         */
180        public String encode(final String s, final String charsetName)
181        {
182                Args.notNull(charsetName, "charsetName");
183
184                try
185                {
186                        return encode(s, Charset.forName(charsetName));
187                }
188                catch (IllegalCharsetNameException | UnsupportedCharsetException e)
189                {
190                        throw new RuntimeException(new UnsupportedEncodingException(charsetName));
191                }
192        }
193
194        /**
195         * @param unsafeInput
196         *            string to encode
197         * @param charset
198         *            encoding to use
199         * @return encoded string
200         */
201        public String encode(final String unsafeInput, final Charset charset)
202        {
203                if (unsafeInput == null || unsafeInput.isEmpty())
204                {
205                        return unsafeInput;
206                }
207
208                Args.notNull(charset, "charset");
209
210                final byte[] bytes = unsafeInput.getBytes(charset);
211                boolean original = true;
212                for (final byte b : bytes)
213                {
214                        if (!type.isAllowed(b) || b == ' ' || b == '\0')
215                        {
216                                original = false;
217                                break;
218                        }
219                }
220                if (original)
221                {
222                        return unsafeInput;
223                }
224
225                final ByteArrayOutputStream bos = new ByteArrayOutputStream(bytes.length);
226                for (final byte b : bytes)
227                {
228                        if (type.isAllowed(b))
229                        {
230                                if (b == ' ')
231                                {
232                                        bos.write('+');
233                                }
234                                else
235                                {
236                                        bos.write(b);
237                                }
238                        }
239                        else
240                        {
241                                if (b == '\0')
242                                {
243                                        bos.writeBytes("NULL".getBytes(charset));
244                                }
245                                else
246                                {
247                                        bos.write('%');
248                                        bos.write(Character.toUpperCase(Character.forDigit((b >> 4) & 0xF, 16)));
249                                        bos.write(Character.toUpperCase(Character.forDigit(b & 0xF, 16)));
250                                }
251                        }
252                }
253                return bos.toString(charset);
254        }
255
256}