diff options
Diffstat (limited to 'libjava/classpath/gnu/regexp/RE.java')
-rw-r--r-- | libjava/classpath/gnu/regexp/RE.java | 190 |
1 files changed, 161 insertions, 29 deletions
diff --git a/libjava/classpath/gnu/regexp/RE.java b/libjava/classpath/gnu/regexp/RE.java index ef606a6..e0665f0 100644 --- a/libjava/classpath/gnu/regexp/RE.java +++ b/libjava/classpath/gnu/regexp/RE.java @@ -41,6 +41,7 @@ import java.io.Serializable; import java.util.Locale; import java.util.PropertyResourceBundle; import java.util.ResourceBundle; +import java.util.Stack; import java.util.Vector; /** @@ -78,13 +79,18 @@ import java.util.Vector; * <P> * * These methods all have similar argument lists. The input can be a - * String, a character array, a StringBuffer, or an + * CharIndexed, String, a character array, a StringBuffer, or an * InputStream of some sort. Note that when using an * InputStream, the stream read position cannot be guaranteed after * attempting a match (this is not a bug, but a consequence of the way * regular expressions work). Using an REMatchEnumeration can * eliminate most positioning problems. * + * Although the input object can be of various types, it is recommended + * that it should be a CharIndexed because {@link CharIndexed#getLastMatch()} + * can show the last match found on this input, which helps the expression + * \G work as the end of the previous match. + * * <P> * * The optional index argument specifies the offset from the beginning @@ -235,6 +241,17 @@ public class RE extends REToken { */ public static final int REG_REPLACE_USE_BACKSLASHESCAPE = 0x0200; + /** + * Compilation flag. Allow whitespace and comments in pattern. + * This is equivalent to the "/x" operator in Perl. + */ + public static final int REG_X_COMMENTS = 0x0400; + + /** + * Compilation flag. If set, REG_ICASE is effective only for US-ASCII. + */ + public static final int REG_ICASE_USASCII = 0x0800; + /** Returns a string representing the version of the gnu.regexp package. */ public static final String version() { return VERSION; @@ -334,6 +351,7 @@ public class RE extends REToken { // Precalculate these so we don't pay for the math every time we // need to access them. boolean insens = ((cflags & REG_ICASE) > 0); + boolean insensUSASCII = ((cflags & REG_ICASE_USASCII) > 0); // Parse pattern into tokens. Does anyone know if it's more efficient // to use char[] than a String.charAt()? I'm assuming so. @@ -372,6 +390,31 @@ public class RE extends REToken { if (quot) unit.bk = false; + if (((cflags & REG_X_COMMENTS) > 0) && (!unit.bk) && (!quot)) { + if (Character.isWhitespace(unit.ch)) { + continue; + } + if (unit.ch == '#') { + for (int i = index; i < pLength; i++) { + if (pattern[i] == '\n') { + index = i + 1; + continue; + } + else if (pattern[i] == '\r') { + if (i + 1 < pLength && pattern[i + 1] == '\n') { + index = i + 2; + } + else { + index = i + 1; + } + continue; + } + } + index = pLength; + continue; + } + } + // ALTERNATION OPERATOR // \| or | (if RE_NO_BK_VBAR) or newline (if RE_NEWLINE_ALT) // not available if RE_LIMITED_OPS is set @@ -420,6 +463,7 @@ public class RE extends REToken { else { addToken(currentToken); currentToken = new RETokenChar(subIndex,unit.ch,insens); + if (insensUSASCII) currentToken.unicodeAware = false; } } @@ -495,8 +539,8 @@ public class RE extends REToken { case 'd': case 'm': case 's': - // case 'u': not supported - // case 'x': not supported + case 'u': + case 'x': case '-': if (!syntax.get(RESyntax.RE_EMBEDDED_FLAGS)) break; // Set or reset syntax flags. @@ -535,8 +579,20 @@ public class RE extends REToken { newCflags |= REG_DOT_NEWLINE; flagIndex++; break; - // case 'u': not supported - // case 'x': not supported + case 'u': + if (negate) + newCflags |= REG_ICASE_USASCII; + else + newCflags &= ~REG_ICASE_USASCII; + flagIndex++; + break; + case 'x': + if (negate) + newCflags &= ~REG_X_COMMENTS; + else + newCflags |= REG_X_COMMENTS; + flagIndex++; + break; case '-': negate = true; flagIndex++; @@ -553,6 +609,7 @@ public class RE extends REToken { syntax = newSyntax; cflags = newCflags; insens = ((cflags & REG_ICASE) > 0); + insensUSASCII = ((cflags & REG_ICASE_USASCII) > 0); // This can be treated as though it were a comment. comment = true; index = flagIndex - 1; @@ -565,6 +622,7 @@ public class RE extends REToken { syntax = newSyntax; cflags = newCflags; insens = ((cflags & REG_ICASE) > 0); + insensUSASCII = ((cflags & REG_ICASE_USASCII) > 0); index = flagIndex -1; // Fall through to the next case. } @@ -673,6 +731,7 @@ public class RE extends REToken { syntax = savedSyntax; cflags = savedCflags; insens = ((cflags & REG_ICASE) > 0); + insensUSASCII = ((cflags & REG_ICASE_USASCII) > 0); flagsSaved = false; } } // not a comment @@ -785,6 +844,7 @@ public class RE extends REToken { index = index - 2 + ce.len; addToken(currentToken); currentToken = new RETokenChar(subIndex,ce.ch,insens); + if (insensUSASCII) currentToken.unicodeAware = false; } // BACKREFERENCE OPERATOR @@ -812,6 +872,7 @@ public class RE extends REToken { int num = parseInt(pattern, numBegin, numEnd-numBegin, 10); currentToken = new RETokenBackRef(subIndex,num,insens); + if (insensUSASCII) currentToken.unicodeAware = false; index = numEnd; } @@ -860,6 +921,7 @@ public class RE extends REToken { else if (unit.bk && (unit.ch == 'd') && syntax.get(RESyntax.RE_CHAR_CLASS_ESCAPES)) { addToken(currentToken); currentToken = new RETokenPOSIX(subIndex,RETokenPOSIX.DIGIT,insens,false); + if (insensUSASCII) currentToken.unicodeAware = false; } // NON-DIGIT OPERATOR @@ -868,6 +930,7 @@ public class RE extends REToken { else if (unit.bk && (unit.ch == 'D') && syntax.get(RESyntax.RE_CHAR_CLASS_ESCAPES)) { addToken(currentToken); currentToken = new RETokenPOSIX(subIndex,RETokenPOSIX.DIGIT,insens,true); + if (insensUSASCII) currentToken.unicodeAware = false; } // NEWLINE ESCAPE @@ -892,6 +955,7 @@ public class RE extends REToken { else if (unit.bk && (unit.ch == 's') && syntax.get(RESyntax.RE_CHAR_CLASS_ESCAPES)) { addToken(currentToken); currentToken = new RETokenPOSIX(subIndex,RETokenPOSIX.SPACE,insens,false); + if (insensUSASCII) currentToken.unicodeAware = false; } // NON-WHITESPACE OPERATOR @@ -900,6 +964,7 @@ public class RE extends REToken { else if (unit.bk && (unit.ch == 'S') && syntax.get(RESyntax.RE_CHAR_CLASS_ESCAPES)) { addToken(currentToken); currentToken = new RETokenPOSIX(subIndex,RETokenPOSIX.SPACE,insens,true); + if (insensUSASCII) currentToken.unicodeAware = false; } // TAB ESCAPE @@ -916,6 +981,7 @@ public class RE extends REToken { else if (unit.bk && (unit.ch == 'w') && syntax.get(RESyntax.RE_CHAR_CLASS_ESCAPES)) { addToken(currentToken); currentToken = new RETokenPOSIX(subIndex,RETokenPOSIX.ALNUM,insens,false); + if (insensUSASCII) currentToken.unicodeAware = false; } // NON-ALPHANUMERIC OPERATOR @@ -924,12 +990,19 @@ public class RE extends REToken { else if (unit.bk && (unit.ch == 'W') && syntax.get(RESyntax.RE_CHAR_CLASS_ESCAPES)) { addToken(currentToken); currentToken = new RETokenPOSIX(subIndex,RETokenPOSIX.ALNUM,insens,true); + if (insensUSASCII) currentToken.unicodeAware = false; } // END OF STRING OPERATOR - // \Z + // \Z, \z - else if (unit.bk && (unit.ch == 'Z') && syntax.get(RESyntax.RE_STRING_ANCHORS)) { + // FIXME: \Z and \z are different in that if the input string + // ends with a line terminator, \Z matches the position before + // the final terminator. This special behavior of \Z is yet + // to be implemented. + + else if (unit.bk && (unit.ch == 'Z' || unit.ch == 'z') && + syntax.get(RESyntax.RE_STRING_ANCHORS)) { addToken(currentToken); currentToken = new RETokenEnd(subIndex,null); } @@ -945,6 +1018,7 @@ public class RE extends REToken { index = index - 2 + ce.len; addToken(currentToken); currentToken = new RETokenChar(subIndex,ce.ch,insens); + if (insensUSASCII) currentToken.unicodeAware = false; } // NAMED PROPERTY @@ -958,6 +1032,16 @@ public class RE extends REToken { index = index - 2 + np.len; addToken(currentToken); currentToken = getRETokenNamedProperty(subIndex,np,insens,index); + if (insensUSASCII) currentToken.unicodeAware = false; + } + + // END OF PREVIOUS MATCH + // \G + + else if (unit.bk && (unit.ch == 'G') && + syntax.get(RESyntax.RE_STRING_ANCHORS)) { + addToken(currentToken); + currentToken = new RETokenEndOfPreviousMatch(subIndex); } // NON-SPECIAL CHARACTER (or escape to make literal) @@ -966,6 +1050,7 @@ public class RE extends REToken { else { // not a special character addToken(currentToken); currentToken = new RETokenChar(subIndex,unit.ch,insens); + if (insensUSASCII) currentToken.unicodeAware = false; } } // end while @@ -1006,6 +1091,7 @@ public class RE extends REToken { throws REException { boolean insens = ((cflags & REG_ICASE) > 0); + boolean insensUSASCII = ((cflags & REG_ICASE_USASCII) > 0); Vector options = new Vector(); Vector addition = new Vector(); boolean additionAndAppeared = false; @@ -1035,7 +1121,9 @@ public class RE extends REToken { if ((ch == '-') && (lastCharIsSet)) { if (index == pLength) throw new REException(getLocalizedMessage("class.no.end"),REException.REG_EBRACK,index); if ((ch = pattern[index]) == ']') { - options.addElement(new RETokenChar(subIndex,lastChar,insens)); + RETokenChar t = new RETokenChar(subIndex,lastChar,insens); + if (insensUSASCII) t.unicodeAware = false; + options.addElement(t); lastChar = '-'; } else { if ((ch == '\\') && syntax.get(RESyntax.RE_BACKSLASH_ESCAPE_IN_LISTS)) { @@ -1045,7 +1133,9 @@ public class RE extends REToken { ch = ce.ch; index = index + ce.len - 1; } - options.addElement(new RETokenRange(subIndex,lastChar,ch,insens)); + RETokenRange t = new RETokenRange(subIndex,lastChar,ch,insens); + if (insensUSASCII) t.unicodeAware = false; + options.addElement(t); lastChar = 0; lastCharIsSet = false; index++; } @@ -1088,12 +1178,20 @@ public class RE extends REToken { asciiEsc = ce.ch; asciiEscIsSet = true; index = index - 1 + ce.len - 1; } - if (lastCharIsSet) options.addElement(new RETokenChar(subIndex,lastChar,insens)); + if (lastCharIsSet) { + RETokenChar t = new RETokenChar(subIndex,lastChar,insens); + if (insensUSASCII) t.unicodeAware = false; + options.addElement(t); + } if (posixID != -1) { - options.addElement(new RETokenPOSIX(subIndex,posixID,insens,negate)); + RETokenPOSIX t = new RETokenPOSIX(subIndex,posixID,insens,negate); + if (insensUSASCII) t.unicodeAware = false; + options.addElement(t); } else if (np != null) { - options.addElement(getRETokenNamedProperty(subIndex,np,insens,index)); + RETokenNamedProperty t = getRETokenNamedProperty(subIndex,np,insens,index); + if (insensUSASCII) t.unicodeAware = false; + options.addElement(t); } else if (asciiEscIsSet) { lastChar = asciiEsc; lastCharIsSet = true; } else { @@ -1104,8 +1202,11 @@ public class RE extends REToken { StringBuffer posixSet = new StringBuffer(); index = getPosixSet(pattern,index+1,posixSet); int posixId = RETokenPOSIX.intValue(posixSet.toString()); - if (posixId != -1) - options.addElement(new RETokenPOSIX(subIndex,posixId,insens,false)); + if (posixId != -1) { + RETokenPOSIX t = new RETokenPOSIX(subIndex,posixId,insens,false); + if (insensUSASCII) t.unicodeAware = false; + options.addElement(t); + } } else if ((ch == '[') && (syntax.get(RESyntax.RE_NESTED_CHARCLASS))) { ParseCharClassResult result = parseCharClass( subIndex, pattern, index, pLength, cflags, syntax, 0); @@ -1158,14 +1259,22 @@ public class RE extends REToken { result.index: result.index - 1); } } else { - if (lastCharIsSet) options.addElement(new RETokenChar(subIndex,lastChar,insens)); + if (lastCharIsSet) { + RETokenChar t = new RETokenChar(subIndex,lastChar,insens); + if (insensUSASCII) t.unicodeAware = false; + options.addElement(t); + } lastChar = ch; lastCharIsSet = true; } if (index == pLength) throw new REException(getLocalizedMessage("class.no.end"),REException.REG_EBRACK,index); } // while in list // Out of list, index is one past ']' - if (lastCharIsSet) options.addElement(new RETokenChar(subIndex,lastChar,insens)); + if (lastCharIsSet) { + RETokenChar t = new RETokenChar(subIndex,lastChar,insens); + if (insensUSASCII) t.unicodeAware = false; + options.addElement(t); + } ParseCharClassResult result = new ParseCharClassResult(); // Create a new RETokenOneOf @@ -1396,11 +1505,10 @@ public class RE extends REToken { return (input.charAt(0) == CharIndexed.OUT_OF_BOUNDS); REMatch m = new REMatch(numSubs, index, eflags); if (firstToken.match(input, m)) { - while (m != null) { + if (m != null) { if (input.charAt(m.index) == CharIndexed.OUT_OF_BOUNDS) { return true; } - m = m.next; } } return false; @@ -1508,17 +1616,27 @@ public class RE extends REToken { } /* Implements abstract method REToken.match() */ - boolean match(CharIndexed input, REMatch mymatch) { + boolean match(CharIndexed input, REMatch mymatch) { if (firstToken == null) { return next(input, mymatch); } // Note the start of this subexpression - mymatch.start[subIndex] = mymatch.index; + mymatch.start1[subIndex] = mymatch.index; return firstToken.match(input, mymatch); } - + + REMatch findMatch(CharIndexed input, REMatch mymatch) { + if (mymatch.backtrackStack == null) + mymatch.backtrackStack = new BacktrackStack(); + boolean b = match(input, mymatch); + if (b) { + return mymatch; + } + return null; + } + /** * Returns the first match found in the input. If no match is found, * null is returned. @@ -1602,6 +1720,7 @@ public class RE extends REToken { */ best.end[0] = best.index; best.finish(input); + input.setLastMatch(best); return best; } } @@ -1942,19 +2061,34 @@ public class RE extends REToken { } void dump(StringBuffer os) { - os.append('('); + os.append("(?#startRE subIndex=" + subIndex + ")"); if (subIndex == 0) os.append("?:"); if (firstToken != null) firstToken.dumpAll(os); - os.append(')'); + if (subIndex == 0) + os.append(")"); + os.append("(?#endRE subIndex=" + subIndex + ")"); } // Cast input appropriately or throw exception - private static CharIndexed makeCharIndexed(Object input, int index) { - // We could let a String fall through to final input, but since - // it's the most likely input type, we check it first. - if (input instanceof String) + // This method was originally a private method, but has been made + // public because java.util.regex.Matcher uses this. + public static CharIndexed makeCharIndexed(Object input, int index) { + // The case where input is already a CharIndexed is supposed + // be the most likely because this is the case with + // java.util.regex.Matcher. + // We could let a String or a CharSequence fall through + // to final input, but since it'a very likely input type, + // we check it first. + if (input instanceof CharIndexed) { + CharIndexed ci = (CharIndexed) input; + ci.setAnchor(index); + return ci; + } + else if (input instanceof CharSequence) + return new CharIndexedCharSequence((CharSequence) input,index); + else if (input instanceof String) return new CharIndexedString((String) input,index); else if (input instanceof char[]) return new CharIndexedCharArray((char[]) input,index); @@ -1962,8 +2096,6 @@ public class RE extends REToken { return new CharIndexedStringBuffer((StringBuffer) input,index); else if (input instanceof InputStream) return new CharIndexedInputStream((InputStream) input,index); - else if (input instanceof CharIndexed) - return (CharIndexed) input; // do we lose index info? else return new CharIndexedString(input.toString(), index); } |