Source for gnu.java.text.WordBreakIterator

   1: /* WordBreakIterator.java - Default word BreakIterator.
   2:    Copyright (C) 1999, 2001, 2004 Free Software Foundation, Inc.
   3: 
   4: This file is part of GNU Classpath.
   5: 
   6: GNU Classpath is free software; you can redistribute it and/or modify
   7: it under the terms of the GNU General Public License as published by
   8: the Free Software Foundation; either version 2, or (at your option)
   9: any later version.
  10: 
  11: GNU Classpath is distributed in the hope that it will be useful, but
  12: WITHOUT ANY WARRANTY; without even the implied warranty of
  13: MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14: General Public License for more details.
  15: 
  16: You should have received a copy of the GNU General Public License
  17: along with GNU Classpath; see the file COPYING.  If not, write to the
  18: Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
  19: 02110-1301 USA.
  20: 
  21: Linking this library statically or dynamically with other modules is
  22: making a combined work based on this library.  Thus, the terms and
  23: conditions of the GNU General Public License cover the whole
  24: combination.
  25: 
  26: As a special exception, the copyright holders of this library give you
  27: permission to link this library with independent modules to produce an
  28: executable, regardless of the license terms of these independent
  29: modules, and to copy and distribute the resulting executable under
  30: terms of your choice, provided that you also meet, for each linked
  31: independent module, the terms and conditions of the license of that
  32: module.  An independent module is a module which is not derived from
  33: or based on this library.  If you modify this library, you may extend
  34: this exception to your version of the library, but you are not
  35: obligated to do so.  If you do not wish to do so, delete this
  36: exception statement from your version. */
  37: 
  38: 
  39: package gnu.java.text;
  40: 
  41: import java.text.CharacterIterator;
  42: 
  43: /**
  44:  * @author Tom Tromey <tromey@cygnus.com>
  45:  * @date March 22, 1999
  46:  * Written using The Unicode Standard, Version 2.0.
  47:  */
  48: 
  49: public class WordBreakIterator extends BaseBreakIterator
  50: {
  51:   public Object clone ()
  52:   {
  53:     return new WordBreakIterator (this);
  54:   }
  55: 
  56:   public WordBreakIterator ()
  57:   {
  58:   }
  59: 
  60:   private WordBreakIterator (WordBreakIterator other)
  61:   {
  62:     iter = (CharacterIterator) other.iter.clone();
  63:   }
  64: 
  65:   // Some methods to tell us different properties of characters.
  66:   private final boolean isHira (char c)
  67:   {
  68:     return c >= 0x3040 && c <= 0x309f;
  69:   }
  70:   private final boolean isKata (char c)
  71:   {
  72:     return c >= 0x30a0 && c <= 0x30ff;
  73:   }
  74:   private final boolean isHan (char c)
  75:   {
  76:     return c >= 0x4e00 && c <= 0x9fff;
  77:   }
  78: 
  79:   public int next ()
  80:   {
  81:     int end = iter.getEndIndex();
  82:     if (iter.getIndex() == end)
  83:       return DONE;
  84: 
  85:     while (iter.getIndex() < end)
  86:       {
  87:         char c = iter.current();
  88:         if (c == CharacterIterator.DONE)
  89:           break;
  90:         int type = Character.getType(c);
  91: 
  92:         char n = iter.next();
  93:         if (n == CharacterIterator.DONE)
  94:           break;
  95: 
  96:         // Break after paragraph separators.
  97:         if (type == Character.PARAGRAPH_SEPARATOR
  98:             || type == Character.LINE_SEPARATOR)
  99:           break;
 100: 
 101:         // Break between letters and non-letters.
 102:         // FIXME: we treat apostrophe as part of a word.  This
 103:         // is an English-ism.
 104:         boolean is_letter = Character.isLetter(c);
 105:         if (c != '\'' && ! is_letter && type != Character.NON_SPACING_MARK
 106:             && Character.isLetter(n))
 107:           break;
 108: 
 109:         // Always break after certain symbols, such as punctuation.
 110:         // This heuristic is derived from hints in the JCL book and is
 111:         // not part of Unicode.  It seems to be right, however.
 112:         // FIXME: we treat apostrophe as part of a word.  This
 113:         // is an English-ism.
 114:         if (c != '\''
 115:             && (type == Character.DASH_PUNCTUATION
 116:                 || type == Character.START_PUNCTUATION
 117:                 || type == Character.END_PUNCTUATION
 118:                 || type == Character.CONNECTOR_PUNCTUATION
 119:                 || type == Character.OTHER_PUNCTUATION
 120:                 || type == Character.MATH_SYMBOL
 121:                 || type == Character.CURRENCY_SYMBOL
 122:                 || type == Character.MODIFIER_SYMBOL
 123:                 || type == Character.OTHER_SYMBOL
 124:                 || type == Character.FORMAT
 125:                 || type == Character.CONTROL))
 126:           break;
 127: 
 128:         boolean is_hira = isHira (c);
 129:         boolean is_kata = isKata (c);
 130:         boolean is_han = isHan (c);
 131: 
 132:         // Special case Japanese.
 133:         if (! is_hira && ! is_kata && ! is_han
 134:             && type != Character.NON_SPACING_MARK
 135:             && (isHira (n) || isKata (n) || isHan (n)))
 136:           break;
 137: 
 138:         if (is_hira || is_kata || is_han || is_letter)
 139:           {
 140:             // Now we need to do some lookahead.  We might need to do
 141:             // quite a bit of lookahead, so we save our position and
 142:             // restore it later.
 143:             int save = iter.getIndex();
 144:             // Skip string of non spacing marks.
 145:             while (n != CharacterIterator.DONE
 146:                    && Character.getType(n) == Character.NON_SPACING_MARK)
 147:               n = iter.next();
 148:             if (n == CharacterIterator.DONE)
 149:               break;
 150:             if ((is_hira && ! isHira (n))
 151:                 || (is_kata && ! isHira (n) && ! isKata (n))
 152:                 || (is_han && ! isHira (n) && ! isHan (n))
 153:                 // FIXME: we treat apostrophe as part of a word.  This
 154:                 // is an English-ism.
 155:                 || (is_letter && ! Character.isLetter(n) && n != '\''))
 156:               break;
 157:             iter.setIndex(save);
 158:           }
 159:       }
 160: 
 161:     return iter.getIndex();
 162:   }
 163: 
 164:   public int previous ()
 165:   {
 166:     int start = iter.getBeginIndex();
 167:     if (iter.getIndex() == start)
 168:       return DONE;
 169: 
 170:     while (iter.getIndex() >= start)
 171:       {
 172:         char c = iter.previous();
 173:         if (c == CharacterIterator.DONE)
 174:           break;
 175: 
 176:         boolean is_hira = isHira (c);
 177:         boolean is_kata = isKata (c);
 178:         boolean is_han = isHan (c);
 179:         boolean is_letter = Character.isLetter(c);
 180: 
 181:         char n = iter.previous();
 182:         if (n == CharacterIterator.DONE)
 183:           break;
 184:         iter.next();
 185:         int type = Character.getType(n);
 186:         // Break after paragraph separators.
 187:         if (type == Character.PARAGRAPH_SEPARATOR
 188:             || type == Character.LINE_SEPARATOR)
 189:           break;
 190: 
 191:         // Break between letters and non-letters.
 192:         // FIXME: we treat apostrophe as part of a word.  This
 193:         // is an English-ism.
 194:         if (n != '\'' && ! Character.isLetter(n)
 195:             && type != Character.NON_SPACING_MARK
 196:             && is_letter)
 197:           break;
 198: 
 199:         // Always break after certain symbols, such as punctuation.
 200:         // This heuristic is derived from hints in the JCL book and is
 201:         // not part of Unicode.  It seems to be right, however.
 202:         // FIXME: we treat apostrophe as part of a word.  This
 203:         // is an English-ism.
 204:         if (n != '\''
 205:             && (type == Character.DASH_PUNCTUATION
 206:                 || type == Character.START_PUNCTUATION
 207:                 || type == Character.END_PUNCTUATION
 208:                 || type == Character.CONNECTOR_PUNCTUATION
 209:                 || type == Character.OTHER_PUNCTUATION
 210:                 || type == Character.MATH_SYMBOL
 211:                 || type == Character.CURRENCY_SYMBOL
 212:                 || type == Character.MODIFIER_SYMBOL
 213:                 || type == Character.OTHER_SYMBOL
 214:                 || type == Character.FORMAT
 215:                 || type == Character.CONTROL))
 216:           break;
 217: 
 218:         // Special case Japanese.
 219:         if ((is_hira || is_kata || is_han)
 220:             && ! isHira (n) && ! isKata (n) && ! isHan (n)
 221:             && type != Character.NON_SPACING_MARK)
 222:           break;
 223: 
 224:         // We might have to skip over non spacing marks to see what's
 225:         // on the other side.
 226:         if (! is_hira || (! is_letter && c != '\''))
 227:           {
 228:             int save = iter.getIndex();
 229:             while (n != CharacterIterator.DONE
 230:                    && Character.getType(n) == Character.NON_SPACING_MARK)
 231:               n = iter.previous();
 232:             iter.setIndex(save);
 233:             // This is a strange case: a bunch of non-spacing marks at
 234:             // the beginning.  We treat the current location as a word
 235:             // break.
 236:             if (n == CharacterIterator.DONE)
 237:               break;
 238:             if ((isHira (n) && ! is_hira)
 239:                 || (isKata (n) && ! is_hira && ! is_kata)
 240:                 || (isHan (n) && ! is_hira && ! is_han)
 241:                 // FIXME: we treat apostrophe as part of a word.  This
 242:                 // is an English-ism.
 243:                 || (! is_letter && c != '\'' && Character.isLetter(n)))
 244:               break;
 245:           }
 246:       }
 247: 
 248:     return iter.getIndex();
 249:   }
 250: }