public StringList read() throws IOException {
  String line = lineStream.read();
  StringList name = null;

  if ((line != null) && (!StringUtil.isEmpty(line))) {
    String name2;
    // find the location of the name separator in the line of data.
    int pos = line.indexOf(' ');
    if (pos != -1) {
      String parsed = line.substring(0, pos);
      // the data is in ALL CAPS ... so the easiest way is to convert
      // back to standard mixed case.
      if ((parsed.length() > 2) && (parsed.startsWith("MC"))) {
        name2 = parsed.substring(0, 1).toUpperCase(locale)
            + parsed.substring(1, 2).toLowerCase(locale)
            + parsed.substring(2, 3).toUpperCase(locale)
            + parsed.substring(3).toLowerCase(locale);
      } else {
        name2 = parsed.substring(0, 1).toUpperCase(locale)
            + parsed.substring(1).toLowerCase(locale);
      }
      name = new StringList(new String[] {name2});
    }
  }

  return name;
}
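A minimal usage sketch (an assumption, not part of the original source): if this read() method belongs to an ObjectStream-style name reader, callers typically drain it until it returns null. The variable name censusNameStream and the use of StringList.getToken(int) are assumptions about the surrounding (OpenNLP-style) API.

// Hypothetical caller; censusNameStream is an assumed instance of the class containing read().
StringList name;
while ((name = censusNameStream.read()) != null) {
  // each StringList holds one mixed-case name, e.g. an ALL-CAPS "MCDONALD" becomes "McDonald"
  System.out.println(name.getToken(0));
}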
public Span[] tokenizePos(String s) {
  boolean isWhitespace;
  List<Span> tokens = new ArrayList<Span>();
  int sl = s.length();
  int start = -1;
  char pc = 0; // previous character, used to detect trailing punctuation
  for (int ci = 0; ci <= sl; ci++) {
    // treat the position one past the end as whitespace so the last token is closed
    char c = ci < sl ? s.charAt(ci) : ' ';
    isWhitespace = StringUtil.isWhitespace(c);
    if (!isWhitespace && start < 0) {
      // new token starts
      start = ci;
    }
    if (isWhitespace && start >= 0) {
      // end of token
      // limited support for punctuation at the end of words
      if (start < ci - 1 && (pc == '.' || pc == ',' || pc == '!' || pc == '?'
          || pc == ';' || pc == ':')) {
        tokens.add(new Span(start, ci - 1));
        tokens.add(new Span(ci - 1, ci));
      } else {
        tokens.add(new Span(start, ci));
      }
      start = -1;
    }
    pc = c; // remember the previous character; without this the punctuation check never fires
  }
  return tokens.toArray(new Span[tokens.size()]);
}
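A short, hedged usage sketch for the tokenizer above: the returned Span offsets map back to substrings of the input, which makes the trailing-punctuation split visible. The tokenizer variable and Span's getStart()/getEnd() accessors are assumptions about the surrounding (OpenNLP-style) API.

// Hypothetical driver; 'tokenizer' is an assumed instance of the enclosing tokenizer class.
String text = "Hello, world. Bye!";
Span[] spans = tokenizer.tokenizePos(text);
for (Span span : spans) {
  System.out.println(text.substring(span.getStart(), span.getEnd()));
}
// With the previous-character tracking in place, this prints the words and the
// trailing punctuation marks as separate tokens: Hello , world . Bye !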
public static void populatePOSDictionary(ObjectStream<POSSample> samples,
    MutableTagDictionary dict, int cutoff) throws IOException {
  System.out.println("Expanding POS Dictionary ...");
  long start = System.nanoTime();

  // the data structure will store the word, the tag, and the number of
  // occurrences
  Map<String, Map<String, AtomicInteger>> newEntries =
      new HashMap<String, Map<String, AtomicInteger>>();

  POSSample sample;
  while ((sample = samples.read()) != null) {
    String[] words = sample.getSentence();
    String[] tags = sample.getTags();

    for (int i = 0; i < words.length; i++) {
      // only store words that do not contain digits
      if (!StringPattern.recognize(words[i]).containsDigit()) {
        String word;
        if (dict.isCaseSensitive()) {
          word = words[i];
        } else {
          word = StringUtil.toLowerCase(words[i]);
        }

        if (!newEntries.containsKey(word)) {
          newEntries.put(word, new HashMap<String, AtomicInteger>());
        }

        String[] dictTags = dict.getTags(word);
        if (dictTags != null) {
          for (String tag : dictTags) {
            // tags already present in the dictionary start at the cutoff,
            // so they are always kept
            Map<String, AtomicInteger> value = newEntries.get(word);
            if (!value.containsKey(tag)) {
              value.put(tag, new AtomicInteger(cutoff));
            }
          }
        }

        if (!newEntries.get(word).containsKey(tags[i])) {
          newEntries.get(word).put(tags[i], new AtomicInteger(1));
        } else {
          newEntries.get(word).get(tags[i]).incrementAndGet();
        }
      }
    }
  }

  // now we check whether the word + tag pairs have enough occurrences; if yes,
  // we add them to the dictionary
  for (Entry<String, Map<String, AtomicInteger>> wordEntry : newEntries.entrySet()) {
    List<String> tagsForWord = new ArrayList<String>();

    for (Entry<String, AtomicInteger> entry : wordEntry.getValue().entrySet()) {
      if (entry.getValue().get() >= cutoff) {
        tagsForWord.add(entry.getKey());
      }
    }

    if (tagsForWord.size() > 0) {
      dict.put(wordEntry.getKey(),
          tagsForWord.toArray(new String[tagsForWord.size()]));
    }
  }

  System.out.println("... finished expanding POS Dictionary. ["
      + (System.nanoTime() - start) / 1000000 + "ms]");
}
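A hedged sketch of how the method above might be called. The concrete MutableTagDictionary implementation is an assumption; OpenNLP's POSDictionary(boolean caseSensitive) is used here only as a plausible choice.

// Hypothetical helper showing the intended call pattern; POSDictionary as the
// MutableTagDictionary implementation is an assumption, not taken from this source.
static MutableTagDictionary buildDictionary(ObjectStream<POSSample> samples, int cutoff)
    throws IOException {
  MutableTagDictionary dict = new POSDictionary(false); // false = case-insensitive (assumption)
  populatePOSDictionary(samples, dict, cutoff);         // keep word/tag pairs seen at least 'cutoff' times
  return dict;
}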
public NameSample read() throws IOException {
  List<String> tokens = new ArrayList<String>();
  List<String> neTypes = new ArrayList<String>();

  boolean isClearAdaptiveData = false;

  // Empty line indicates end of sentence
  String line;
  while ((line = lineStream.read()) != null && !StringUtil.isEmpty(line)) {
    // clear adaptive data if a document mark appears, following CoNLL03 conventions
    if (clearFeatures.equalsIgnoreCase("docstart") && line.startsWith("-DOCSTART-")) {
      isClearAdaptiveData = true;
      String emptyLine = lineStream.read();
      if (!StringUtil.isEmpty(emptyLine))
        throw new IOException("Empty line after -DOCSTART- not empty: '" + emptyLine + "'!");
      continue;
    }

    String[] fields = line.split("\t");
    if (fields.length == 2) {
      tokens.add(fields[0]);
      neTypes.add(fields[1]);
    } else {
      throw new IOException("Expected two fields per line in training data, got "
          + fields.length + " for line '" + line + "'!");
    }
  }

  // if there is no -DOCSTART- mark, check whether features must be cleared for every sentence
  if (clearFeatures.equalsIgnoreCase("yes")) {
    isClearAdaptiveData = true;
  }

  if (tokens.size() > 0) {
    // convert name tags into spans
    List<Span> names = new ArrayList<Span>();

    int beginIndex = -1;
    int endIndex = -1;
    for (int i = 0; i < neTypes.size(); i++) {
      String neTag = neTypes.get(i);
      if (neTag.equals("O")) {
        // O means the token is outside of any name; close a pending span if there is one
        if (beginIndex != -1) {
          names.add(extract(beginIndex, endIndex, neTypes.get(beginIndex)));
          beginIndex = -1;
          endIndex = -1;
        }
      } else if (neTag.startsWith("B-")) {
        // B- starts a new entity, possibly directly adjacent to an entity of the same type
        if (beginIndex != -1) {
          names.add(extract(beginIndex, endIndex, neTypes.get(beginIndex)));
        }
        beginIndex = i;
        endIndex = i + 1;
      } else if (neTag.startsWith("I-")) {
        // I- starts or continues the current name entity
        if (beginIndex == -1) {
          beginIndex = i;
          endIndex = i + 1;
        } else if (!neTag.endsWith(neTypes.get(beginIndex).substring(1))) {
          // the tag type changed, so the previous entity ends here and a new one starts
          names.add(extract(beginIndex, endIndex, neTypes.get(beginIndex)));
          beginIndex = i;
          endIndex = i + 1;
        } else {
          endIndex++;
        }
      } else {
        throw new IOException("Invalid tag: " + neTag);
      }
    }

    // if one span remains, create it here
    if (beginIndex != -1)
      names.add(extract(beginIndex, endIndex, neTypes.get(beginIndex)));

    return new NameSample(tokens.toArray(new String[tokens.size()]),
        names.toArray(new Span[names.size()]), isClearAdaptiveData);
  } else if (line != null) {
    // just filter out empty events, if two lines in a row are empty
    return read();
  } else {
    // source stream is not returning any more lines
    return null;
  }
}
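read() above delegates span creation to an extract(...) helper that is not shown in this excerpt. The sketch below is a hedged guess at what it plausibly does, based only on how it is called (begin index, end index, and the B-/I- tag of the entity's first token); the original implementation may differ.

// Hypothetical reconstruction of the extract(...) helper; an assumption, not the original code.
// It strips the "B-"/"I-" prefix and builds a typed Span covering [begin, end).
private static Span extract(int begin, int end, String beginTag) throws IOException {
  if (beginTag.length() < 3 || beginTag.charAt(1) != '-') {
    throw new IOException("Invalid tag: " + beginTag);
  }
  String type = beginTag.substring(2);  // e.g. "B-PER" -> "PER"
  return new Span(begin, end, type);    // assumes Span offers a (start, end, type) constructor
}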