View Javadoc

1   /*
2    * $Header: /home/projects/jaxen/scm/jaxen/src/java/main/org/jaxen/saxpath/base/XPathLexer.java,v 1.5 2005/03/23 13:11:18 elharo Exp $
3    * $Revision: 1.5 $
4    * $Date: 2005/03/23 13:11:18 $
5    *
6    * ====================================================================
7    *
8    * Copyright (C) 2000-2002 bob mcwhirter & James Strachan.
9    * All rights reserved.
10   *
11   * Redistribution and use in source and binary forms, with or without
12   * modification, are permitted provided that the following conditions
13   * are met:
14   *
15   * 1. Redistributions of source code must retain the above copyright
16   *    notice, this list of conditions, and the following disclaimer.
17   *
18   * 2. Redistributions in binary form must reproduce the above copyright
19   *    notice, this list of conditions, and the disclaimer that follows
20   *    these conditions in the documentation and/or other materials
21   *    provided with the distribution.
22   *
23   * 3. The name "Jaxen" must not be used to endorse or promote products
24   *    derived from this software without prior written permission.  For
25   *    written permission, please contact license@jaxen.org.
26   *
27   * 4. Products derived from this software may not be called "Jaxen", nor
28   *    may "Jaxen" appear in their name, without prior written permission
29   *    from the Jaxen Project Management (pm@jaxen.org).
30   *
31   * In addition, we request (but do not require) that you include in the
32   * end-user documentation provided with the redistribution and/or in the
33   * software itself an acknowledgement equivalent to the following:
34   *     "This product includes software developed by the
35   *      Jaxen Project (http://www.jaxen.org/)."
36   * Alternatively, the acknowledgment may be graphical using the logos
37   * available at http://www.jaxen.org/
38   *
39   * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
40   * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
41   * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
42   * DISCLAIMED.  IN NO EVENT SHALL THE Jaxen AUTHORS OR THE PROJECT
43   * CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
44   * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
45   * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
46   * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
47   * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
48   * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
49   * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
50   * SUCH DAMAGE.
51   *
52   * ====================================================================
53   * This software consists of voluntary contributions made by many
54   * individuals on behalf of the Jaxen Project and was originally
55   * created by bob mcwhirter <bob@werken.com> and
56   * James Strachan <jstrachan@apache.org>.  For more information on the
57   * Jaxen Project, please see <http://www.jaxen.org/>.
58   *
59   * $Id: XPathLexer.java,v 1.5 2005/03/23 13:11:18 elharo Exp $
60   */
61  
62  
63  
64  
65  package org.jaxen.saxpath.base;
66  
67  class XPathLexer
68  {
69      private String xpath;
70      private int    currentPosition;
71      private int    endPosition;
72  
73      private Token  previousToken;
74  
75      public XPathLexer(String xpath)
76      {
77          setXPath( xpath );
78      }
79  
80      XPathLexer()
81      {
82      }
83  
84      void setXPath(String xpath)
85      {
86          this.xpath           = xpath;
87          this.currentPosition = 0;
88          this.endPosition     = xpath.length();
89      }
90  
91      public String getXPath()
92      {
93          return this.xpath;
94      }
95  
96      public Token nextToken()
97      {
98          Token token = null;
99  
100         do
101         {
102             token = null;
103 
104             switch ( LA(1) )
105             {
106                 case '$':
107                 {
108                     token = dollar();
109                     break;
110                 }
111                     
112                 case '"':
113                 case '\'':
114                 {
115                     token = literal();
116                     break;
117                 }
118                     
119                 case '/':
120                 {
121                     token = slashes();
122                     break;
123                 }
124 
125                 case ',':
126                 {
127                     token = comma();
128                     break;
129                 }
130                     
131                 case '(':
132                 {
133                     token = leftParen();
134                     break;
135                 }
136                     
137                 case ')':
138                 {
139                     token = rightParen();
140                     break;
141                 }
142                     
143                 case '[':
144                 {
145                     token = leftBracket();
146                     break;
147                 }
148                     
149                 case ']':
150                 {
151                     token = rightBracket();
152                     break;
153                 }
154                     
155                 case '+':
156                 {
157                     token = plus();
158                     break;
159                 }
160                     
161                 case '-':
162                 {
163                     token = minus();
164                     break;
165                 }
166                     
167                 case '<':
168                 case '>':
169                 {
170                     token = relationalOperator();
171                     break;
172                 }        
173 
174                 case '=':
175                 {
176                     token = equals();
177                     break;
178                 }
179                     
180                 case '!':
181                 {
182                     if ( LA(2) == '=' )
183                     {
184                         token = notEquals();
185                     }
186                     else
187                     {
188                         token = not();
189                     }
190                     break;
191                 }
192                     
193                 case '|':
194                 {
195                     token = pipe();
196                     break;
197                 }
198                     
199                 case '@':
200                 {
201                     token = at();
202                     break;
203                 }
204                     
205                 case ':':
206                 {
207                     if ( LA(2) == ':' )
208                     {
209                         token = doubleColon();
210                     }
211                     else
212                     {
213                         token = colon();
214                     }
215                     break;
216                 }
217                     
218                 case '*':
219                 {
220                     token = star();
221                     break;
222                 }
223                     
224                 case '.':
225                 {
226                     switch ( LA(2) )
227                     {
228                         case '0':
229                         case '1':
230                         case '2':
231                         case '3':
232                         case '4':
233                         case '5':
234                         case '6':
235                         case '7':
236                         case '8':
237                         case '9':
238                         {
239                             token = number();
240                             break;
241                         }
242                         default:
243                         {
244                             token = dots();
245                             break;
246                         }
247                     }
248                     break;
249                 }
250 
251                 case '0':
252                 case '1':
253                 case '2':
254                 case '3':
255                 case '4':
256                 case '5':
257                 case '6':
258                 case '7':
259                 case '8':
260                 case '9':
261                 {
262                     token = number();
263                     break;
264                 }
265 
266                 case ' ':
267                 case '\t':
268                 case '\n':
269                 case '\r':
270                 {
271                     token = whitespace();
272                     break;
273                 }
274                     
275                 default:
276                 {
277                     if ( isIdentifierStartChar( LA(1) ) )
278                     {
279                         token = identifierOrOperatorName();
280                     }
281                 }
282             }
283 
284             if ( token == null )
285             {
286                 if (!hasMoreChars())
287                 {
288                     token = new Token( TokenTypes.EOF,
289                                    getXPath(),
290                                    currentPosition(),
291                                    endPosition() );
292             }
293                 else
294                 {
295                     token = new Token( TokenTypes.ERROR,
296                                    getXPath(),
297                                    currentPosition(),
298                                    endPosition() );
299                 }
300             }
301 
302         }
303         while ( token.getTokenType() == TokenTypes.SKIP );
304 
305         setPreviousToken( token );
306         
307         return token;
308     }
309 
310 Token identifierOrOperatorName()
311 {
312     Token token = null;
313 
314     Token previousToken = getPreviousToken();
315 
316     if ( previousToken != null )
317     {
318         // For some reason, section 3.7, Lexical structure,
319         // doesn't seem to feel like it needs to mention the
320         // SLASH, DOUBLE_SLASH, and COLON tokens for the test
321         // if an NCName is an operator or not.
322         //
323         // According to section 3.7, "/foo" should be considered
324         // as a SLASH following by an OperatorName being 'foo'.
325         // Which is just simply, clearly, wrong, in my mind.
326         //
327         //     -bob
328         
329         switch ( previousToken.getTokenType() )
330         {
331             case TokenTypes.AT:
332             case TokenTypes.DOUBLE_COLON:
333             case TokenTypes.LEFT_PAREN:
334             case TokenTypes.LEFT_BRACKET:
335             case TokenTypes.AND:
336             case TokenTypes.OR:
337             case TokenTypes.MOD:
338             case TokenTypes.DIV:
339             case TokenTypes.COLON:
340             case TokenTypes.SLASH:
341             case TokenTypes.DOUBLE_SLASH:
342             case TokenTypes.PIPE:
343             case TokenTypes.DOLLAR:
344             case TokenTypes.PLUS:
345             case TokenTypes.MINUS:
346             case TokenTypes.STAR:
347             case TokenTypes.COMMA:
348             case TokenTypes.LESS_THAN:
349             case TokenTypes.GREATER_THAN:
350             case TokenTypes.LESS_THAN_EQUALS:
351             case TokenTypes.GREATER_THAN_EQUALS:
352             case TokenTypes.EQUALS:
353             case TokenTypes.NOT_EQUALS:
354             {
355                 token = identifier();
356                 break;
357             }
358             default:
359             {
360                 token = operatorName();
361                 break;
362             }
363         }
364     }
365     else
366     {
367         token = identifier();
368     }
369 
370     return token;
371 }
372 
373 Token identifier()
374 {
375     Token token = null;
376 
377     int start = currentPosition();
378 
379     while ( hasMoreChars() )
380     {
381         if ( isIdentifierChar( LA(1) ) )
382         {
383             consume();
384         }
385         else
386         {
387             break;
388         }
389     }
390 
391     token = new Token( TokenTypes.IDENTIFIER,
392                        getXPath(),
393                        start,
394                        currentPosition() );
395 
396     return token;
397 }
398 
399 Token operatorName()
400 {
401     Token token = null;
402 
403     switch ( LA(1) )
404     {
405         case 'a':
406         {
407             token = and();
408             break;
409         }
410 
411         case 'o':
412         {
413             token = or();
414             break;
415         }
416 
417         case 'm':
418         {
419             token = mod();
420             break;
421         }
422 
423         case 'd':
424         {
425             token = div();
426             break;
427         }
428     }
429 
430     return token;
431 }
432 
433 Token mod()
434 {
435     Token token = null;
436 
437     if ( ( LA(1) == 'm' )
438          &&
439          ( LA(2) == 'o' )
440          &&
441          ( LA(3) == 'd' ) 
442          &&
443          ( ! isIdentifierChar( LA(4) ) ) )
444     {
445         token = new Token( TokenTypes.MOD,
446                            getXPath(),
447                            currentPosition(),
448                            currentPosition()+3 );
449 
450         consume();
451         consume();
452         consume();
453     }
454 
455     return token;
456 }
457 
458 Token div()
459 {
460     Token token = null;
461 
462     if ( ( LA(1) == 'd' )
463          &&
464          ( LA(2) == 'i' )
465          &&
466          ( LA(3) == 'v' ) 
467          &&
468          ( ! isIdentifierChar( LA(4) ) ) )
469     {
470         token = new Token( TokenTypes.DIV,
471                            getXPath(),
472                            currentPosition(),
473                            currentPosition()+3 );
474 
475         consume();
476         consume();
477         consume();
478     }
479 
480     return token;
481 }
482 
483 Token and()
484 {
485     Token token = null;
486 
487     if ( ( LA(1) == 'a' )
488          &&
489          ( LA(2) == 'n' )
490          &&
491          ( LA(3) == 'd' )
492          &&
493          ( ! isIdentifierChar( LA(4) ) ) )
494     {
495         token = new Token( TokenTypes.AND,
496                            getXPath(),
497                            currentPosition(),
498                            currentPosition()+3 );
499 
500         consume();
501         consume();
502         consume();
503     }
504 
505     return token;
506 }
507 
508 Token or()
509 {
510     Token token = null;
511 
512     if ( ( LA(1) == 'o' )
513          &&
514          ( LA(2) == 'r' ) 
515          &&
516          ( ! isIdentifierChar( LA(3) ) ) )
517     {
518         token = new Token( TokenTypes.OR,
519                            getXPath(),
520                            currentPosition(),
521                            currentPosition()+2 );
522 
523         consume();
524         consume();
525     }
526 
527     return token;
528 }
529 
530 Token number()
531 {
532     int     start         = currentPosition();
533     boolean periodAllowed = true;
534 
535   loop:
536     while( true )
537     {
538         switch ( LA(1) )
539         {
540             case '.':
541             {
542                 if ( periodAllowed )
543                 {
544                     periodAllowed = false;
545                     consume();
546                 }
547                 else
548                 {
549                     break loop;
550                 }
551                 break;
552             }
553             
554             case '0':
555             case '1':
556             case '2':
557             case '3':
558             case '4':
559             case '5':
560             case '6':
561             case '7':
562             case '8':
563             case '9':
564             {
565                 consume();
566                 break;
567             }
568             default:
569             {
570                 break loop;
571             }
572         }
573     }
574 
575     Token token = null;
576 
577     if ( periodAllowed )
578     {
579         token = new Token( TokenTypes.INTEGER,
580                            getXPath(),
581                            start,
582                            currentPosition() );
583     }
584     else
585     {
586         token = new Token( TokenTypes.DOUBLE,
587                            getXPath(),
588                            start,
589                            currentPosition() );
590     }
591 
592     return token;
593 }
594 
595 Token whitespace()
596 {
597     consume();
598         
599   loop:
600     while( hasMoreChars() )
601     {
602         switch ( LA(1) )
603         {
604             case ' ':
605             case '\t':
606             case '\n':
607             case '\r':
608             {
609                 consume();
610                 break;
611             }
612                 
613             default:
614             {
615                 break loop;
616             }
617         }
618     }
619 
620     return new Token( TokenTypes.SKIP,
621                       getXPath(),
622                       0,
623                       0 );
624 }
625 
626 Token comma()
627 {
628     Token token = new Token( TokenTypes.COMMA,
629                              getXPath(),
630                              currentPosition(),
631                              currentPosition()+1 );
632 
633     consume();
634 
635     return token;
636 }
637 
638 Token equals()
639 {
640     Token token = new Token( TokenTypes.EQUALS,
641                              getXPath(),
642                              currentPosition(),
643                              currentPosition()+1 );
644 
645     consume();
646 
647     return token;
648 }
649 
650 Token minus()
651 {
652     Token token = new Token( TokenTypes.MINUS,
653                              getXPath(),
654                              currentPosition(),
655                              currentPosition()+1 );
656     consume();
657         
658     return token;
659 }
660 
661 Token plus()
662 {
663     Token token = new Token( TokenTypes.PLUS,
664                              getXPath(),
665                              currentPosition(),
666                              currentPosition()+1 );
667     consume();
668 
669     return token;
670 }
671 
672 Token dollar()
673 {
674     Token token = new Token( TokenTypes.DOLLAR,
675                              getXPath(),
676                              currentPosition(),
677                              currentPosition()+1 );
678     consume();
679 
680     return token;
681 }
682 
683 Token pipe()
684 {
685     Token token = new Token( TokenTypes.PIPE,
686                              getXPath(),
687                              currentPosition(),
688                              currentPosition()+1 );
689 
690     consume();
691 
692     return token;
693 }
694 
695 Token at()
696 {
697     Token token = new Token( TokenTypes.AT,
698                              getXPath(),
699                              currentPosition(),
700                              currentPosition()+1 );
701 
702     consume();
703 
704     return token;
705 }
706 
707 Token colon()
708 {
709     Token token = new Token( TokenTypes.COLON,
710                              getXPath(),
711                              currentPosition(),
712                              currentPosition()+1 );
713     consume();
714 
715     return token;
716 }
717 
718 Token doubleColon()
719 {
720     Token token = new Token( TokenTypes.DOUBLE_COLON,
721                              getXPath(),
722                              currentPosition(),
723                              currentPosition()+2 );
724 
725     consume();
726     consume();
727 
728     return token;
729 }
730 
731 Token not()
732 {
733     Token token = new Token( TokenTypes.NOT,
734                              getXPath(),
735                              currentPosition(),
736                              currentPosition() + 1 );
737 
738     consume();
739 
740     return token;
741 }
742 
743 Token notEquals()
744 {
745     Token token = new Token( TokenTypes.NOT_EQUALS,
746                              getXPath(),
747                              currentPosition(),
748                              currentPosition() + 2 );
749 
750     consume();
751     consume();
752 
753     return token;
754 }
755 
756 Token relationalOperator()
757 {
758     Token token = null;
759 
760     switch ( LA(1) )
761     {
762         case '<':
763         {
764             if ( LA(2) == '=' )
765             {
766                 token = new Token( TokenTypes.LESS_THAN_EQUALS,
767                                    getXPath(),
768                                    currentPosition(),
769                                    currentPosition() + 2 );
770                 consume();
771             }
772             else
773             {
774                 token = new Token( TokenTypes.LESS_THAN,
775                                    getXPath(),
776                                    currentPosition(),
777                                    currentPosition() + 1);
778             }
779 
780             consume();
781             break;
782         }
783         case '>':
784         {
785             if ( LA(2) == '=' )
786             {
787                 token = new Token( TokenTypes.GREATER_THAN_EQUALS,
788                                    getXPath(),
789                                    currentPosition(),
790                                    currentPosition() + 2 );
791                 consume();
792             }
793             else
794             {
795                 token = new Token( TokenTypes.GREATER_THAN,
796                                    getXPath(),
797                                    currentPosition(),
798                                    currentPosition() + 1 );
799             }
800 
801             consume();
802             break;
803         }
804     }
805 
806     return token;
807             
808 }
809 
810 Token star()
811 {
812     Token token = new Token( TokenTypes.STAR,
813                              getXPath(),
814                              currentPosition(),
815                              currentPosition()+1 );
816 
817     consume();
818         
819     return token;
820 }
821 
822 Token literal()
823 {
824     Token token = null;
825 
826     char match  = LA(1);
827 
828     consume();
829 
830     int start = currentPosition();
831         
832     while ( ( token == null )
833             &&
834             hasMoreChars() )
835     {
836         if ( LA(1) == match )
837         {
838             token = new Token( TokenTypes.LITERAL,
839                                getXPath(),
840                                start,
841                                currentPosition() );
842         }
843         consume();
844     }
845 
846     return token;
847 }
848 
849 Token dots()
850 {
851     Token token = null;
852 
853     switch ( LA(2) )
854     {
855         case '.':
856         {
857             token = new Token( TokenTypes.DOT_DOT,
858                                getXPath(),
859                                currentPosition(),
860                                currentPosition()+2 ) ;
861             consume();
862             consume();
863             break;
864         }
865         default:
866         {
867             token = new Token( TokenTypes.DOT,
868                                getXPath(),
869                                currentPosition(),
870                                currentPosition()+1 );
871             consume();
872             break;
873         }
874     }
875 
876     return token;
877 }
878 
879 Token leftBracket()
880 {
881     Token token = new Token( TokenTypes.LEFT_BRACKET,
882                              getXPath(),
883                              currentPosition(),
884                              currentPosition()+1 );
885 
886     consume();
887 
888     return token;
889 }
890 
891 Token rightBracket()
892 {
893     Token token = new Token( TokenTypes.RIGHT_BRACKET,
894                              getXPath(),
895                              currentPosition(),
896                              currentPosition()+1 );
897 
898     consume();
899 
900     return token;
901 }
902 
903 Token leftParen()
904 {
905     Token token = new Token( TokenTypes.LEFT_PAREN,
906                              getXPath(),
907                              currentPosition(),
908                              currentPosition()+1 );
909 
910     consume();
911 
912     return token;
913 }
914 
915 Token rightParen()
916 {
917     Token token = new Token( TokenTypes.RIGHT_PAREN,
918                              getXPath(),
919                              currentPosition(),
920                              currentPosition()+1 );
921 
922     consume();
923 
924     return token;
925 }
926 
927 Token slashes()
928 {
929     Token token = null;
930 
931     switch ( LA(2) )
932     {
933         case '/':
934         {
935             token = new Token( TokenTypes.DOUBLE_SLASH,
936                                getXPath(),
937                                currentPosition(),
938                                currentPosition()+2 );
939             consume();
940             consume();
941             break;
942         }
943         default:
944         {
945             token = new Token( TokenTypes.SLASH,
946                                getXPath(),
947                                currentPosition(),
948                                currentPosition()+1 );
949             consume();
950         }
951     }
952 
953     return token;
954 }
955 
956 char LA(int i) 
957 {
958     if ( currentPosition + ( i - 1 ) >= endPosition() )
959     {
960         return (char) -1;
961     }
962 
963     return getXPath().charAt( currentPosition() + (i - 1) );
964 }
965 
966 void consume()
967 {
968     ++this.currentPosition;
969 }
970 
971 void consume(int i)
972 {
973     this.currentPosition += i;
974 }
975 
976 int currentPosition()
977 {
978     return this.currentPosition;
979 }
980 
981 int endPosition()
982 {
983     return this.endPosition;
984 }
985 
986 Token getPreviousToken()
987 {
988     return this.previousToken;
989 }
990 
991 void setPreviousToken(Token previousToken)
992 {
993     this.previousToken = previousToken;
994 }
995 
996 boolean hasMoreChars()
997 {
998     return currentPosition() < endPosition();
999 }
1000 
1001 boolean isIdentifierChar(char c)
1002 {
1003     return Verifier.isXMLNCNameCharacter( c );
1004 }
1005 
1006 boolean isIdentifierStartChar(char c)
1007 {
1008     return Verifier.isXMLNCNameStartCharacter( c );
1009 }
1010 
1011 }