aboutsummaryrefslogtreecommitdiff
path: root/libjava/java
diff options
context:
space:
mode:
authorMichael Koch <konqueror@gmx.de>2004-05-31 22:16:31 +0000
committerMichael Koch <mkoch@gcc.gnu.org>2004-05-31 22:16:31 +0000
commit57807c317869610e07a85bf8bf95747638230ed7 (patch)
tree84eadd76d1df1c8b46bd05dd00699bc802f78611 /libjava/java
parentf7dbd56c9a1e2605f54a05c2f613cf086f32634b (diff)
downloadgcc-57807c317869610e07a85bf8bf95747638230ed7.zip
gcc-57807c317869610e07a85bf8bf95747638230ed7.tar.gz
gcc-57807c317869610e07a85bf8bf95747638230ed7.tar.bz2
CollationElementIterator.java, [...]: New versions from GNU classpath.
2004-06-01 Michael Koch <konqueror@gmx.de> * java/text/CollationElementIterator.java, java/text/CollationKey.java, java/text/RuleBasedCollator.java: New versions from GNU classpath. * testsuite/libjava.mauve/xfails: Removed all java.text.CollationElementIterator tests. From-SVN: r82510
Diffstat (limited to 'libjava/java')
-rw-r--r--libjava/java/text/CollationElementIterator.java269
-rw-r--r--libjava/java/text/CollationKey.java28
-rw-r--r--libjava/java/text/RuleBasedCollator.java925
3 files changed, 935 insertions, 287 deletions
diff --git a/libjava/java/text/CollationElementIterator.java b/libjava/java/text/CollationElementIterator.java
index 1b5a617..a4c2100 100644
--- a/libjava/java/text/CollationElementIterator.java
+++ b/libjava/java/text/CollationElementIterator.java
@@ -38,6 +38,8 @@ exception statement from your version. */
package java.text;
+import java.util.Vector;
+
/* Written using "Java Class Libraries", 2nd edition, plus online
* API docs for JDK 1.2 from http://www.javasoft.com.
* Status: Believed complete and correct to JDK 1.1.
@@ -74,13 +76,25 @@ public final class CollationElementIterator
String text;
/**
+ * This is the index into the collation decomposition where we are currently scanning.
+ */
+ int index;
+
+ /**
* This is the index into the String where we are currently scanning.
*/
int textIndex;
- // A piece of lookahead.
- boolean lookahead_set;
- int lookahead;
+ /**
+ * Array containing the collation decomposition of the
+ * text given to the constructor.
+ */
+ private Object[] text_decomposition;
+
+ /**
+ * Array containing the index of the specified block.
+ */
+ private int[] text_indexes;
/**
* This method initializes a new instance of <code>CollationElementIterator</code>
@@ -97,6 +111,35 @@ public final class CollationElementIterator
setText (text);
}
+ RuleBasedCollator.CollationElement nextBlock()
+ {
+ if (index >= text_decomposition.length)
+ return null;
+
+ RuleBasedCollator.CollationElement e =
+ (RuleBasedCollator.CollationElement) text_decomposition[index];
+
+ textIndex = text_indexes[index+1];
+
+ index++;
+
+ return e;
+ }
+
+ RuleBasedCollator.CollationElement previousBlock()
+ {
+ if (index == 0)
+ return null;
+
+ index--;
+ RuleBasedCollator.CollationElement e =
+ (RuleBasedCollator.CollationElement) text_decomposition[index];
+
+ textIndex = text_indexes[index+1];
+
+ return e;
+ }
+
/**
* This method returns the collation ordering value of the next character sequence
* in the string (it may be an extended character following collation rules).
@@ -107,10 +150,29 @@ public final class CollationElementIterator
*/
public int next()
{
- if (textIndex == text.length())
+ RuleBasedCollator.CollationElement e = nextBlock();
+
+ if (e == null)
return NULLORDER;
+
+ return e.getValue();
+ }
+
+ /**
+ * This method returns the collation ordering value of the previous character
+ * in the string. This method will return <code>NULLORDER</code> if the
+ * beginning of the string was reached.
+ *
+ * @return The collation ordering value.
+ */
+ public int previous()
+ {
+ RuleBasedCollator.CollationElement e = previousBlock();
- return collator.ceiNext (this);
+ if (e == null)
+ return NULLORDER;
+
+ return e.getValue();
}
/**
@@ -133,9 +195,8 @@ public final class CollationElementIterator
*/
public void reset()
{
+ index = 0;
textIndex = 0;
- lookahead_set = false;
- lookahead = 0;
}
/**
@@ -176,10 +237,152 @@ public final class CollationElementIterator
*/
public void setText(String text)
{
+ int idx = 0;
+ int idx_idx = 0;
+ int alreadyExpanded = 0;
+ int idxToMove = 0;
+
this.text = text;
- this.textIndex = 0;
- this.lookahead_set = false;
- this.lookahead = 0;
+ this.index = 0;
+
+ String work_text = text.intern();
+
+ Vector v = new Vector();
+ Vector vi = new Vector();
+
+ // Build element collection ordered as they come in "text".
+ while (idx < work_text.length())
+ {
+ String key, key_old;
+
+ Object object = null;
+ int p = 1;
+
+ // IMPROVE: use a TreeMap with a prefix-ordering rule.
+ key_old = key = null;
+ do
+ {
+ if (object != null)
+ key_old = key;
+ key = work_text.substring (idx, idx+p);
+ object = collator.prefix_tree.get (key);
+ if (object != null && idx < alreadyExpanded)
+ {
+ RuleBasedCollator.CollationElement prefix = (RuleBasedCollator.CollationElement)object;
+ if (prefix.expansion != null &&
+ prefix.expansion.startsWith(work_text.substring(0, idx)))
+ {
+ object = null;
+ key = key_old;
+ }
+ }
+ p++;
+ }
+ while (idx+p <= work_text.length());
+
+ if (object == null)
+ key = key_old;
+
+ RuleBasedCollator.CollationElement prefix =
+ (RuleBasedCollator.CollationElement) collator.prefix_tree.get (key);
+
+ /*
+ * First case: There is no such sequence in the database.
+ * We will have to build one from the context.
+ */
+ if (prefix == null)
+ {
+ /*
+ * We are dealing with sequences in an expansion. They
+ * are treated as accented characters (tertiary order).
+ */
+ if (alreadyExpanded > 0)
+ {
+ RuleBasedCollator.CollationElement e =
+ collator.getDefaultAccentedElement (work_text.charAt (idx));
+
+ v.add (e);
+ vi.add (new Integer(idx_idx));
+ idx++;
+ alreadyExpanded--;
+ if (alreadyExpanded == 0)
+ {
+ /* There is not any characters left in the expansion set.
+ * We can increase the pointer in the source string.
+ */
+ idx_idx += idxToMove;
+ idxToMove = 0;
+ }
+ else
+ idx_idx++;
+ }
+ else
+ {
+ /* This is a normal character. */
+ RuleBasedCollator.CollationElement e =
+ collator.getDefaultElement (work_text.charAt (idx));
+ Integer i_ref = new Integer(idx_idx);
+
+ /* Don't forget to mark it as a special sequence so the
+ * string can be ordered.
+ */
+ v.add (RuleBasedCollator.SPECIAL_UNKNOWN_SEQ);
+ vi.add (i_ref);
+ v.add (e);
+ vi.add (i_ref);
+ idx_idx++;
+ idx++;
+ }
+ continue;
+ }
+
+ /*
+ * Second case: Here we have found a matching sequence.
+ * Here we have an expansion string prepend it to the "work text" and
+ * add the corresponding sorting element. We must also mark
+ */
+ if (prefix.expansion != null)
+ {
+ work_text = prefix.expansion
+ + work_text.substring (idx+prefix.key.length());
+ idx = 0;
+ v.add (prefix);
+ vi.add (new Integer(idx_idx));
+ if (alreadyExpanded == 0)
+ idxToMove = prefix.key.length();
+ alreadyExpanded += prefix.expansion.length()-prefix.key.length();
+ }
+ else
+ {
+ /* Third case: the simplest. We have got the prefix and it
+ * has not to be expanded.
+ */
+ v.add (prefix);
+ vi.add (new Integer(idx_idx));
+ idx += prefix.key.length();
+ /* If the sequence is in an expansion, we must decrease the
+ * counter.
+ */
+ if (alreadyExpanded > 0)
+ {
+ alreadyExpanded -= prefix.key.length();
+ if (alreadyExpanded == 0)
+ {
+ idx_idx += idxToMove;
+ idxToMove = 0;
+ }
+ } else
+ idx_idx += prefix.key.length();
+ }
+ }
+
+ text_decomposition = v.toArray();
+ text_indexes = new int[vi.size()+1];
+ for (int i = 0; i < vi.size(); i++)
+ {
+ text_indexes[i] = ((Integer)vi.elementAt(i)).intValue();
+ }
+ text_indexes[vi.size()] = text.length();
}
/**
@@ -215,4 +418,50 @@ public final class CollationElementIterator
{
return textIndex;
}
+
+ /**
+ * This method sets the iteration index position into the current
+ * <code>String</code> to the specified value. This value must not
+ * be negative and must not be greater than the last index position
+ * in the <code>String</code>.
+ *
+ * @param offset The new iteration index position.
+ *
+ * @exception IllegalArgumentException If the new offset is not valid.
+ */
+ public void setOffset(int offset)
+ {
+ if (offset < 0)
+ throw new IllegalArgumentException("Negative offset: " + offset);
+
+ if (offset > (text.length() - 1))
+ throw new IllegalArgumentException("Offset too large: " + offset);
+
+ for (index = 0; index < text_decomposition.length; index++)
+ {
+ if (offset <= text_indexes[index])
+ break;
+ }
+ /*
+ * As text_indexes[0] == 0, we should not have to take care whether index is
+ * greater than 0. It is always.
+ */
+ if (text_indexes[index] == offset)
+ textIndex = offset;
+ else
+ textIndex = text_indexes[index-1];
+ }
+
+ /**
+ * This method returns the maximum length of any expansion sequence that
+ * ends with the specified collation order value. (Whatever that means).
+ *
+ * @param value The collation order value
+ *
+ * @param The maximum length of an expansion sequence.
+ */
+ public int getMaxExpansion(int value)
+ {
+ return 1;
+ }
}
diff --git a/libjava/java/text/CollationKey.java b/libjava/java/text/CollationKey.java
index abc28b2..4e6aca2 100644
--- a/libjava/java/text/CollationKey.java
+++ b/libjava/java/text/CollationKey.java
@@ -78,24 +78,13 @@ public final class CollationKey implements Comparable
/**
* This is the bit value for this key.
*/
- private int[] key;
+ private byte[] key;
- CollationKey(Collator collator, CollationElementIterator iter,
- String originalText, int strength)
+ CollationKey (Collator collator, String originalText, byte[] key)
{
this.collator = collator;
this.originalText = originalText;
-
- // Compute size of required array.
- int size = 0;
- while (RuleBasedCollator.next(iter, strength)
- != CollationElementIterator.NULLORDER)
- ++size;
-
- iter.reset();
- key = new int[size];
- for (int i = 0; i < size; i++)
- key[i] = RuleBasedCollator.next(iter, strength);
+ this.key = key;
}
/**
@@ -205,15 +194,6 @@ public final class CollationKey implements Comparable
*/
public byte[] toByteArray()
{
- byte[] r = new byte[4 * key.length];
- int off = 0;
- for (int i = 0; i < key.length; ++i)
- {
- r[off++] = (byte) ((key[i] >>> 24) & 255);
- r[off++] = (byte) ((key[i] >>> 16) & 255);
- r[off++] = (byte) ((key[i] >>> 8) & 255);
- r[off++] = (byte) ((key[i] ) & 255);
- }
- return r;
+ return key;
}
}
diff --git a/libjava/java/text/RuleBasedCollator.java b/libjava/java/text/RuleBasedCollator.java
index 64c2ac2..23f238e 100644
--- a/libjava/java/text/RuleBasedCollator.java
+++ b/libjava/java/text/RuleBasedCollator.java
@@ -38,7 +38,7 @@ exception statement from your version. */
package java.text;
import java.util.Enumeration;
-import java.util.Hashtable;
+import java.util.HashMap;
import java.util.Vector;
/* Written using "Java Class Libraries", 2nd edition, plus online
@@ -147,32 +147,104 @@ public class RuleBasedCollator extends Collator
* This class describes what rank has a character (or a sequence of characters)
* in the lexicographic order. Each element in a rule has a collation element.
*/
- final class CollationElement
+ final static class CollationElement
{
String key;
- char relation;
-
- CollationElement(String key, char relation)
+ int primary;
+ short secondary;
+ short tertiary;
+ short equality;
+ boolean ignore;
+ String expansion;
+
+ CollationElement(String key, int primary, short secondary, short tertiary,
+ short equality, String expansion, boolean ignore)
{
this.key = key;
- this.relation = relation;
+ this.primary = primary;
+ this.secondary = secondary;
+ this.tertiary = tertiary;
+ this.equality = equality;
+ this.ignore = ignore;
+ this.expansion = expansion;
+ }
+
+ final int getValue()
+ {
+ return (primary << 16) + (secondary << 8) + tertiary;
}
}
- // True if we are using French-style accent ordering.
- private boolean frenchAccents;
+ /**
+ * Basic collation instruction (internal format) to build the series of
+ * collation elements. It contains an instruction which specifies the new
+ * state of the generator. The sequence of instruction should not contain
+ * RESET (it is used by
+ * {@link #mergeRules(int,java.lang.String,java.util.Vector,java.util.Vector)})
+ * as a temporary state while merging two sets of instructions.
+ */
+ final static class CollationSorter
+ {
+ static final int GREATERP = 0;
+ static final int GREATERS = 1;
+ static final int GREATERT = 2;
+ static final int EQUAL = 3;
+ static final int RESET = 4;
+ static final int INVERSE_SECONDARY = 5;
+
+ int comparisonType;
+ String textElement;
+ int hashText;
+ int offset;
+ boolean ignore;
+
+ String expansionOrdering;
+ }
/**
* This the the original rule string.
*/
private String rules;
- // This maps strings onto collation values.
- private Hashtable map;
-
- // An entry in this hash means that more lookahead is required for
- // the prefix string.
- private Hashtable prefixes;
+ /**
+ * This is the table of collation element values
+ */
+ private Object[] ce_table;
+
+ /**
+ * Quick-prefix finder.
+ */
+ HashMap prefix_tree;
+
+ /**
+ * This is the value of the last sequence entered into
+ * <code>ce_table</code>. It is used to compute the
+ * ordering value of unspecified character.
+ */
+ private int last_primary_value;
+
+ /**
+ * This is the value of the last secondary sequence of the
+ * primary 0, entered into
+ * <code>ce_table</code>. It is used to compute the
+ * ordering value of an unspecified accented character.
+ */
+ private int last_tertiary_value;
+
+ /**
+ * This variable is true if accents need to be sorted
+ * in the other direction.
+ */
+ private boolean inverseAccentComparison;
+
+ /**
+ * This collation element is special to unknown sequence.
+ * The JDK uses it to mark and sort the characters which has
+ * no collation rules.
+ */
+ static final CollationElement SPECIAL_UNKNOWN_SEQ =
+ new CollationElement("", (short) 32767, (short) 0, (short) 0,
+ (short) 0, null, false);
/**
* This method initializes a new instance of <code>RuleBasedCollator</code>
@@ -192,128 +264,313 @@ public class RuleBasedCollator extends Collator
this.rules = rules;
- // We keep each rule in order in a vector. At the end we traverse
- // the vector and compute collation values from it.
- int insertion_index = 0;
- Vector vec = new Vector ();
+ buildCollationVector(parseString(rules));
+ buildPrefixAccess();
+ }
+ /**
+ * This method returns the number of common characters at the beginning
+ * of the string of the two parameters.
+ *
+ * @param prefix A string considered as a prefix to test against
+ * the other string.
+ * @param s A string to test the prefix against.
+ * @return The number of common characters.
+ */
+ static int findPrefixLength(String prefix, String s)
+ {
int index;
- StringBuffer argument = new StringBuffer();
- int len = rules.length();
+ int len = prefix.length();
- for (index = 0; index < len; ++index)
+ for (index = 0; index < len && index < s.length(); ++index)
{
- char c = rules.charAt(index);
+ if (prefix.charAt(index) != s.charAt(index))
+ return index;
+ }
- // Just skip whitespace.
- if (Character.isWhitespace(c))
- continue;
- // Modifier.
- if (c == '@')
+ return index;
+ }
+
+ /**
+ * Here we are merging two sets of sorting instructions: 'patch' into 'main'. This methods
+ * checks whether it is possible to find an anchor point for the rules to be merged and
+ * then insert them at that precise point.
+ *
+ * @param offset Offset in the string containing rules of the beginning of the rules
+ * being merged in.
+ * @param starter Text of the rules being merged.
+ * @param main Repository of all already parsed rules.
+ * @param patch Rules to be merged into the repository.
+ * @throws ParseException if it is impossible to find an anchor point for the new rules.
+ */
+ private void mergeRules(int offset, String starter, Vector main, Vector patch)
+ throws ParseException
+ {
+ Enumeration elements = main.elements();
+ int insertion_point = -1;
+ int max_length = 0;
+
+ /* We must check that no rules conflict with another already present. If it
+ * is the case delete the old rule.
+ */
+
+ /* For the moment good old O(N^2) algorithm.
+ */
+ for (int i = 0; i < patch.size(); i++)
+ {
+ int j = 0;
+
+ while (j < main.size())
{
- frenchAccents = true;
- continue;
+ CollationSorter rule1 = (CollationSorter) patch.elementAt(i);
+ CollationSorter rule2 = (CollationSorter) main.elementAt(j);
+
+ if (rule1.textElement.equals(rule2.textElement))
+ main.removeElementAt(j);
+ else
+ j++;
}
+ }
- // Check for relation or reset operator.
- if (! (c == '<' || c == ';' || c == ',' || c == '=' || c == '&'))
- throw new ParseException ("invalid character", index);
-
- ++index;
- while (index < len)
+ // Find the insertion point... O(N)
+ for (int i = 0; i < main.size(); i++)
+ {
+ CollationSorter sorter = (CollationSorter) main.elementAt(i);
+ int length = findPrefixLength(starter, sorter.textElement);
+
+ if (length > max_length)
{
- if (! Character.isWhitespace(rules.charAt(index)))
- break;
- ++index;
+ max_length = length;
+ insertion_point = i+1;
}
- if (index == len)
- throw new ParseException ("missing argument", index);
-
- int save = index;
- index = text_argument (rules, index, argument);
- if (argument.length() == 0)
- throw new ParseException ("invalid character", save);
- String arg = argument.toString();
- int item_index = -1;
-
- for (int j = 0; j < vec.size(); ++j)
- {
- CollationElement e = (CollationElement) vec.elementAt (j);
+ }
- if (arg.equals (e.key))
- {
- item_index = j;
- break;
- }
- }
+ if (insertion_point < 0)
+ throw new ParseException("no insertion point found for " + starter, offset);
+
+ if (max_length < starter.length())
+ {
+ /*
+ * We need to expand the first entry. It must be sorted
+ * like if it was the reference key itself (like the spec
+ * said. So the first entry is special: the element is
+ * replaced by the specified text element for the sorting.
+ * This text replace the old one for comparisons. However
+ * to preserve the behaviour we replace the first key (corresponding
+ * to the found prefix) by a new code rightly ordered in the
+ * sequence. The rest of the subsequence must be appended
+ * to the end of the sequence.
+ */
+ CollationSorter sorter = (CollationSorter) patch.elementAt(0);
+ CollationSorter expansionPrefix =
+ (CollationSorter) main.elementAt(insertion_point-1);
- if (c != '&')
+ sorter.expansionOrdering = starter.substring(max_length); // Skip the first good prefix element
+
+ main.insertElementAt(sorter, insertion_point);
+
+ /*
+ * This is a new set of rules. Append to the list.
+ */
+ patch.removeElementAt(0);
+ insertion_point++;
+ }
+
+ // Now insert all elements of patch at the insertion point.
+ for (int i = 0; i < patch.size(); i++)
+ main.insertElementAt(patch.elementAt(i), i+insertion_point);
+ }
+
+ /**
+ * This method parses a string and build a set of sorting instructions. The parsing
+ * may only be partial on the case the rules are to be merged sometime later.
+ *
+ * @param stop_on_reset If this parameter is true then the parser stops when it
+ * encounters a reset instruction. In the other case, it tries to parse the subrules
+ * and merged it in the same repository.
+ * @param v Output vector for the set of instructions.
+ * @param base_offset Offset in the string to begin parsing.
+ * @param rules Rules to be parsed.
+ * @return -1 if the parser reached the end of the string, an integer representing the
+ * offset in the string at which it stopped parsing.
+ * @throws ParseException if something turned wrong during the parsing. To get details
+ * decode the message.
+ */
+ private int subParseString(boolean stop_on_reset, Vector v,
+ int base_offset, String rules)
+ throws ParseException
+ {
+ boolean ignoreChars = (base_offset == 0);
+ int operator = -1;
+ StringBuffer sb = new StringBuffer();
+ boolean doubleQuote = false;
+ boolean eatingChars = false;
+ boolean nextIsModifier = false;
+ boolean isModifier = false;
+ int i;
+
+main_parse_loop:
+ for (i = 0; i < rules.length(); i++)
+ {
+ char c = rules.charAt(i);
+ int type = -1;
+
+ if (!eatingChars &&
+ ((c >= 0x09 && c <= 0x0D) || (c == 0x20)))
+ continue;
+
+ isModifier = nextIsModifier;
+ nextIsModifier = false;
+
+ if (eatingChars && c != '\'')
{
- // If the argument already appears in the vector, then we
- // must remove it in order to re-order.
- if (item_index != -1)
- {
- vec.removeElementAt(item_index);
- if (insertion_index >= item_index)
- --insertion_index;
- }
- CollationElement r = new CollationElement (arg, c);
- vec.insertElementAt(r, insertion_index);
- ++insertion_index;
+ doubleQuote = false;
+ sb.append(c);
+ continue;
}
- else
+ if (doubleQuote && eatingChars)
{
- // Reset.
- if (item_index == -1)
- throw
- new ParseException ("argument to reset not previously seen",
- save);
- insertion_index = item_index + 1;
+ sb.append(c);
+ doubleQuote = false;
+ continue;
}
- // Ugly: in this case the resulting INDEX comes from
- // text_argument, which returns the index of the next
- // character we should examine.
- --index;
- }
+ switch (c) {
+ case '!':
+ throw new ParseException
+ ("Modifier '!' is not yet supported by Classpath", i+base_offset);
+ case '<':
+ type = CollationSorter.GREATERP;
+ break;
+ case ';':
+ type = CollationSorter.GREATERS;
+ break;
+ case ',':
+ type = CollationSorter.GREATERT;
+ break;
+ case '=':
+ type = CollationSorter.EQUAL;
+ break;
+ case '\'':
+ eatingChars = !eatingChars;
+ doubleQuote = true;
+ break;
+ case '@':
+ if (ignoreChars)
+ throw new ParseException
+ ("comparison list has not yet been started. You may only use"
+ + "(<,;=&)", i+base_offset);
+ // Inverse the order of secondaries from now on.
+ nextIsModifier = true;
+ type = CollationSorter.INVERSE_SECONDARY;
+ break;
+ case '&':
+ type = CollationSorter.RESET;
+ if (stop_on_reset)
+ break main_parse_loop;
+ break;
+ default:
+ if (operator < 0)
+ throw new ParseException
+ ("operator missing at " + (i+base_offset), i+base_offset);
+ if (!eatingChars &&
+ ((c >= 0x21 && c <= 0x2F)
+ || (c >= 0x3A && c <= 0x40)
+ || (c >= 0x5B && c <= 0x60)
+ || (c >= 0x7B && c <= 0x7E)))
+ throw new ParseException
+ ("unquoted punctuation character '"+c+"'", i+base_offset);
+
+ //type = ignoreChars ? CollationSorter.IGNORE : -1;
+ sb.append(c);
+ break;
+ }
- // Now construct a hash table that maps strings onto their
- // collation values.
- int primary = 0;
- int secondary = 0;
- int tertiary = 0;
- this.map = new Hashtable ();
- this.prefixes = new Hashtable ();
- Enumeration e = vec.elements();
- while (e.hasMoreElements())
- {
- CollationElement r = (CollationElement) e.nextElement();
- switch (r.relation)
+ if (type < 0)
+ continue;
+
+ if (operator < 0)
{
- case '<':
- ++primary;
- secondary = 0;
- tertiary = 0;
- break;
- case ';':
- ++secondary;
- tertiary = 0;
- break;
- case ',':
- ++tertiary;
- break;
- case '=':
- break;
+ operator = type;
+ continue;
}
- // This must match CollationElementIterator.
- map.put(r.key, new Integer (primary << 16
- | secondary << 8 | tertiary));
- // Make a map of all lookaheads we might need.
- for (int i = r.key.length() - 1; i >= 1; --i)
- prefixes.put(r.key.substring(0, i), Boolean.TRUE);
+ if (sb.length() == 0 && !isModifier)
+ throw new ParseException
+ ("text element empty at " + (i+base_offset), i+base_offset);
+
+ if (operator == CollationSorter.RESET)
+ {
+ /* Reposition in the sorting list at the position
+ * indicated by the text element.
+ */
+ String subrules = rules.substring(i);
+ Vector sorted_rules = new Vector();
+ int idx;
+
+ // Parse the subrules but do not iterate through all
+ // sublist. This is the priviledge of the first call.
+ idx = subParseString(true, sorted_rules, base_offset+i, subrules);
+
+ // Merge new parsed rules into the list.
+ mergeRules(base_offset+i, sb.toString(), v, sorted_rules);
+ sb.setLength(0);
+
+ // Reset state to none.
+ operator = -1;
+ type = -1;
+ // We have found a new subrule at 'idx' but it has not been parsed.
+ if (idx >= 0)
+ {
+ i += idx-1;
+ continue main_parse_loop;
+ }
+ else
+ // No more rules.
+ break main_parse_loop;
+ }
+
+ CollationSorter sorter = new CollationSorter();
+
+ if (operator == CollationSorter.GREATERP)
+ ignoreChars = false;
+
+ sorter.comparisonType = operator;
+ sorter.textElement = sb.toString();
+ sorter.hashText = sorter.textElement.hashCode();
+ sorter.offset = base_offset+rules.length();
+ sorter.ignore = ignoreChars;
+ sb.setLength(0);
+
+ v.add(sorter);
+ operator = type;
+ }
+
+ if (operator >= 0)
+ {
+ CollationSorter sorter = new CollationSorter();
+ int pos = rules.length() + base_offset;
+
+ if ((sb.length() != 0 && nextIsModifier)
+ || (sb.length() == 0 && !nextIsModifier && !eatingChars))
+ throw new ParseException("text element empty at " + pos, pos);
+
+ if (operator == CollationSorter.GREATERP)
+ ignoreChars = false;
+
+ sorter.comparisonType = operator;
+ sorter.textElement = sb.toString();
+ sorter.hashText = sorter.textElement.hashCode();
+ sorter.offset = base_offset+pos;
+ sorter.ignore = ignoreChars;
+ v.add(sorter);
}
+
+ if (i == rules.length())
+ return -1;
+ else
+ return i;
}
/**
@@ -323,90 +580,130 @@ public class RuleBasedCollator extends Collator
*/
public Object clone()
{
- RuleBasedCollator c = (RuleBasedCollator) super.clone ();
- c.map = (Hashtable) map.clone ();
- c.prefixes = (Hashtable) map.clone ();
- return c;
+ return super.clone();
}
- // A helper for CollationElementIterator.next().
- int ceiNext (CollationElementIterator cei)
+ /**
+ * This method completely parses a string 'rules' containing sorting rules.
+ *
+ * @param rules String containing the rules to be parsed.
+ * @return A set of sorting instructions stored in a Vector.
+ * @throws ParseException if something turned wrong during the parsing. To get details
+ * decode the message.
+ */
+ private Vector parseString(String rules)
+ throws ParseException
{
- if (cei.lookahead_set)
- {
- cei.lookahead_set = false;
- return cei.lookahead;
- }
+ Vector v = new Vector();
- int save = cei.textIndex;
- int max = cei.text.length();
- String s = null;
-
- // It is possible to have a case where `abc' has a mapping, but
- // neither `ab' nor `abd' do. In this case we must treat `abd' as
- // nothing special.
- boolean found = false;
+ // result of the first subParseString is not absolute (may be -1 or a
+ // positive integer). But we do not care.
+ subParseString(false, v, 0, rules);
+
+ return v;
+ }
- int i;
- for (i = save + 1; i <= max; ++i)
+ /**
+ * This method uses the sorting instructions built by {@link #parseString}
+ * to build collation elements which can be directly used to sort strings.
+ *
+ * @param parsedElements Parsed instructions stored in a Vector.
+ * @throws ParseException if the order of the instructions are not valid.
+ */
+ private void buildCollationVector(Vector parsedElements)
+ throws ParseException
+ {
+ int primary_seq = 0;
+ int last_tertiary_seq = 0;
+ short secondary_seq = 0;
+ short tertiary_seq = 0;
+ short equality_seq = 0;
+ boolean inverseComparisons = false;
+ final boolean DECREASING = false;
+ final boolean INCREASING = true;
+ boolean secondaryType = INCREASING;
+ Vector v = new Vector();
+
+ // elts is completely sorted.
+element_loop:
+ for (int i = 0; i < parsedElements.size(); i++)
{
- s = cei.text.substring(save, i);
- if (prefixes.get(s) == null)
- break;
- found = true;
- }
- // Assume s != null.
+ CollationSorter elt = (CollationSorter) parsedElements.elementAt(i);
+ boolean ignoreChar = false;
- Object obj = map.get(s);
- // The special case.
- while (found && obj == null && s.length() > 1)
- {
- --i;
- s = cei.text.substring(save, i);
- obj = map.get(s);
+ switch (elt.comparisonType)
+ {
+ case CollationSorter.GREATERP:
+ primary_seq++;
+ if (inverseComparisons)
+ {
+ secondary_seq = Short.MAX_VALUE;
+ secondaryType = DECREASING;
+ }
+ else
+ {
+ secondary_seq = 0;
+ secondaryType = INCREASING;
+ }
+ tertiary_seq = 0;
+ equality_seq = 0;
+ inverseComparisons = false;
+ break;
+ case CollationSorter.GREATERS:
+ if (secondaryType == DECREASING)
+ secondary_seq--;
+ else
+ secondary_seq++;
+ tertiary_seq = 0;
+ equality_seq = 0;
+ break;
+ case CollationSorter.INVERSE_SECONDARY:
+ inverseComparisons = true;
+ continue element_loop;
+ case CollationSorter.GREATERT:
+ tertiary_seq++;
+ if (primary_seq == 0)
+ last_tertiary_seq = tertiary_seq;
+ equality_seq = 0;
+ break;
+ case CollationSorter.EQUAL:
+ equality_seq++;
+ break;
+ case CollationSorter.RESET:
+ throw new ParseException
+ ("Invalid reached state 'RESET'. Internal error", elt.offset);
+ default:
+ throw new ParseException
+ ("Invalid unknown state '" + elt.comparisonType + "'", elt.offset);
+ }
+
+ v.add(new CollationElement(elt.textElement, primary_seq,
+ secondary_seq, tertiary_seq,
+ equality_seq, elt.expansionOrdering, elt.ignore));
}
- // Update state.
- cei.textIndex = i;
+ this.inverseAccentComparison = inverseComparisons;
- if (obj == null)
- {
- // This idea, and the values, come from JDK.
- // assert (s.length() == 1)
- cei.lookahead_set = true;
- cei.lookahead = s.charAt(0) << 8;
- return 0x7fff << 16;
- }
+ ce_table = v.toArray();
- return ((Integer) obj).intValue();
+ last_primary_value = primary_seq+1;
+ last_tertiary_value = last_tertiary_seq+1;
}
- // A helper for compareTo() that returns the next character that has
- // a nonzero ordering at the indicated strength. This is also used
- // in CollationKey.
- static final int next (CollationElementIterator iter, int strength)
+ /**
+ * Build a tree where all keys are the texts of collation elements and data is
+ * the collation element itself. The tree is used when extracting all prefix
+ * for a given text.
+ */
+ private void buildPrefixAccess()
{
- while (true)
+ prefix_tree = new HashMap();
+
+ for (int i = 0; i < ce_table.length; i++)
{
- int os = iter.next();
- if (os == CollationElementIterator.NULLORDER)
- return os;
- int c = 0;
- switch (strength)
- {
- case PRIMARY:
- c = os & ~0xffff;
- break;
- case SECONDARY:
- c = os & ~0x00ff;
- break;
- case TERTIARY:
- case IDENTICAL:
- c = os;
- break;
- }
- if (c != 0)
- return c;
+ CollationElement e = (CollationElement) ce_table[i];
+
+ prefix_tree.put(e.key, e);
}
}
@@ -425,34 +722,115 @@ public class RuleBasedCollator extends Collator
public int compare(String source, String target)
{
CollationElementIterator cs, ct;
+ CollationElement ord1block = null;
+ CollationElement ord2block = null;
+ boolean advance_block_1 = true;
+ boolean advance_block_2 = true;
- cs = new CollationElementIterator(this, source);
- ct = new CollationElementIterator(this, target);
+ cs = getCollationElementIterator(source);
+ ct = getCollationElementIterator(target);
for(;;)
{
- int os = next (cs, strength);
- int ot = next (ct, strength);
+ int ord1;
+ int ord2;
+
+ /*
+ * We have to check whether the characters are ignorable.
+ * If it is the case then forget them.
+ */
+ if (advance_block_1)
+ {
+ ord1block = cs.nextBlock();
+ if (ord1block != null && ord1block.ignore)
+ continue;
+ }
+
+ if (advance_block_2)
+ {
+ ord2block = ct.nextBlock();
+ if (ord2block != null && ord2block.ignore)
+ {
+ advance_block_1 = false;
+ continue;
+ }
+ }
+ else
+ advance_block_2 = true;
- if (os == CollationElementIterator.NULLORDER
- && ot == CollationElementIterator.NULLORDER)
- break;
- else if (os == CollationElementIterator.NULLORDER)
+ if (!advance_block_1)
+ advance_block_1 = true;
+
+ if (ord1block != null)
+ ord1 = ord1block.getValue();
+ else
{
- // Source string is shorter, so return "less than".
+ if (ord2block == null)
+ return 0;
return -1;
}
- else if (ot == CollationElementIterator.NULLORDER)
+
+ if (ord2block == null)
+ return 1;
+
+ ord2 = ord2block.getValue();
+
+ // We know chars are totally equal, so skip
+ if (ord1 == ord2)
{
- // Target string is shorter, so return "greater than".
- return 1;
+ if (getStrength() == IDENTICAL)
+ if (!ord1block.key.equals(ord2block.key))
+ return ord1block.key.compareTo(ord2block.key);
+ continue;
}
- if (os != ot)
- return os - ot;
- }
+ // Check for primary strength differences
+ int prim1 = CollationElementIterator.primaryOrder(ord1);
+ int prim2 = CollationElementIterator.primaryOrder(ord2);
+
+ if (prim1 == 0 && getStrength() < TERTIARY)
+ {
+ advance_block_2 = false;
+ continue;
+ }
+ else if (prim2 == 0 && getStrength() < TERTIARY)
+ {
+ advance_block_1 = false;
+ continue;
+ }
+
+ if (prim1 < prim2)
+ return -1;
+ else if (prim1 > prim2)
+ return 1;
+ else if (getStrength() == PRIMARY)
+ continue;
+
+ // Check for secondary strength differences
+ int sec1 = CollationElementIterator.secondaryOrder(ord1);
+ int sec2 = CollationElementIterator.secondaryOrder(ord2);
+
+ if (sec1 < sec2)
+ return -1;
+ else if (sec1 > sec2)
+ return 1;
+ else if (getStrength() == SECONDARY)
+ continue;
+
+ // Check for tertiary differences
+ int tert1 = CollationElementIterator.tertiaryOrder(ord1);
+ int tert2 = CollationElementIterator.tertiaryOrder(ord2);
+
+ if (tert1 < tert2)
+ return -1;
+ else if (tert1 > tert2)
+ return 1;
+ else if (getStrength() == TERTIARY)
+ continue;
- return 0;
+ // Apparently JDK does this (at least for my test case).
+ return ord1block.key.compareTo(ord2block.key);
+ }
}
/**
@@ -467,13 +845,54 @@ public class RuleBasedCollator extends Collator
*/
public boolean equals(Object obj)
{
- if (! (obj instanceof RuleBasedCollator) || ! super.equals(obj))
+ if (obj == this)
+ return true;
+ else
return false;
- RuleBasedCollator rbc = (RuleBasedCollator) obj;
- // FIXME: this is probably wrong. Instead we should compare maps
- // directly.
- return (frenchAccents == rbc.frenchAccents
- && rules.equals(rbc.rules));
+ }
+
+ /**
+ * This method builds a default collation element without invoking
+ * the database created from the rules passed to the constructor.
+ *
+ * @param c Character which needs a collation element.
+ * @return A valid brand new CollationElement instance.
+ */
+ CollationElement getDefaultElement(char c)
+ {
+ int v;
+
+ // Preliminary support for generic accent sorting inversion (I don't know if all
+ // characters in the range should be sorted backward). This is the place
+ // to fix this if needed.
+ if (inverseAccentComparison && (c >= 0x02B9 && c <= 0x0361))
+ v = 0x0361 - ((int) c - 0x02B9);
+ else
+ v = (short) c;
+ return new CollationElement("" + c, last_primary_value + v,
+ (short) 0, (short) 0, (short) 0, null, false);
+ }
+
+ /**
+ * This method builds a default collation element for an accented character
+ * without invoking the database created from the rules passed to the constructor.
+ *
+ * @param c Character which needs a collation element.
+ * @return A valid brand new CollationElement instance.
+ */
+ CollationElement getDefaultAccentedElement(char c)
+ {
+ int v;
+
+ // Preliminary support for generic accent sorting inversion (I don't know if all
+ // characters in the range should be sorted backward). This is the place
+ // to fix this if needed.
+ if (inverseAccentComparison && (c >= 0x02B9 && c <= 0x0361))
+ v = 0x0361 - ((int) c - 0x02B9);
+ else
+ v = (short) c;
+ return new CollationElement("" + c, (short) 0,
+ (short) 0, (short) (last_tertiary_value + v), (short) 0, null, false);
}
/**
@@ -489,13 +908,7 @@ public class RuleBasedCollator extends Collator
*/
public CollationElementIterator getCollationElementIterator(String source)
{
- int len = source.length();
- StringBuffer expand = new StringBuffer(len);
-
- for (int index = 0; index < len; ++index)
- decomposeCharacter(source.charAt(index), expand);
-
- return new CollationElementIterator(this, expand.toString());
+ return new CollationElementIterator(this, source);
}
/**
@@ -517,7 +930,7 @@ public class RuleBasedCollator extends Collator
c = source.next())
decomposeCharacter(c, expand);
- return new CollationElementIterator(this, expand.toString());
+ return getCollationElementIterator(expand.toString());
}
/**
@@ -533,8 +946,52 @@ public class RuleBasedCollator extends Collator
*/
public CollationKey getCollationKey(String source)
{
- return new CollationKey(this, getCollationElementIterator(source), source,
- strength);
+ CollationElementIterator cei = getCollationElementIterator(source);
+ Vector vect = new Vector(25);
+
+ int ord = cei.next();
+ cei.reset(); //set to start of string
+
+ while (ord != CollationElementIterator.NULLORDER)
+ {
+ // If the primary order is null, it means this is an ignorable
+ // character.
+ if (CollationElementIterator.primaryOrder(ord) == 0)
+ {
+ ord = cei.next();
+ continue;
+ }
+ switch (getStrength())
+ {
+ case PRIMARY:
+ ord = CollationElementIterator.primaryOrder(ord);
+ break;
+
+ case SECONDARY:
+ ord = CollationElementIterator.primaryOrder(ord) << 8;
+ ord |= CollationElementIterator.secondaryOrder(ord);
+
+ default:
+ break;
+ }
+
+ vect.add(new Integer(ord));
+ ord = cei.next(); //increment to next key
+ }
+
+ Object[] objarr = vect.toArray();
+ byte[] key = new byte[objarr.length * 4];
+
+ for (int i = 0; i < objarr.length; i++)
+ {
+ int j = ((Integer) objarr[i]).intValue();
+ key [i * 4] = (byte) ((j & 0xFF000000) >> 24);
+ key [i * 4 + 1] = (byte) ((j & 0x00FF0000) >> 16);
+ key [i * 4 + 2] = (byte) ((j & 0x0000FF00) >> 8);
+ key [i * 4 + 3] = (byte) (j & 0x000000FF);
+ }
+
+ return new CollationKey(this, source, key);
}
/**
@@ -555,44 +1012,6 @@ public class RuleBasedCollator extends Collator
*/
public int hashCode()
{
- return (frenchAccents ? 1231 : 1237
- ^ rules.hashCode()
- ^ map.hashCode()
- ^ prefixes.hashCode());
+ return System.identityHashCode(this);
}
-
- private final boolean is_special (char c)
- {
- // Rules from JCL book.
- return ((c >= 0x0021 && c <= 0x002f)
- || (c >= 0x003a && c <= 0x0040)
- || (c >= 0x005b && c <= 0x0060)
- || (c >= 0x007b && c <= 0x007e));
- }
-
- private final int text_argument (String rules, int index,
- StringBuffer result)
- {
- result.setLength(0);
- int len = rules.length();
- while (index < len)
- {
- char c = rules.charAt (index);
- if (c == '\''
- && index + 2 < len
- && rules.charAt (index + 2) == '\'')
- {
- result.append (rules.charAt (index + 1));
- index += 2;
- }
- else if (is_special (c))
- return index;
- else if (!Character.isWhitespace (c))
- result.append (c);
-
- ++index;
- }
- return index;
- }
-
}