public StringList read() throws IOException { String line = lineStream.read(); StringList name = null; if ((line != null) && (!StringUtil.isEmpty(line))) { String name2; // find the location of the name separator in the line of data. int pos = line.indexOf(' '); if ((pos != -1)) { String parsed = line.substring(0, pos); // the data is in ALL CAPS ... so the easiest way is to convert // back to standard mixed case. if ((parsed.length() > 2) && (parsed.startsWith("MC"))) { name2 = parsed.substring(0, 1).toUpperCase(locale) + parsed.substring(1, 2).toLowerCase(locale) + parsed.substring(2, 3).toUpperCase(locale) + parsed.substring(3).toLowerCase(locale); } else { name2 = parsed.substring(0, 1).toUpperCase(locale) + parsed.substring(1).toLowerCase(locale); } name = new StringList(new String[] {name2}); } } return name; }
public NameSample read() throws IOException { List<String> tokens = new ArrayList<String>(); List<String> neTypes = new ArrayList<String>(); boolean isClearAdaptiveData = false; // Empty line indicates end of sentence String line; while ((line = lineStream.read()) != null && !StringUtil.isEmpty(line)) { // clear adaptive data if document mark appears following // CoNLL03 conventions if (clearFeatures.equalsIgnoreCase("docstart") && line.startsWith("-DOCSTART-")) { isClearAdaptiveData = true; String emptyLine = lineStream.read(); if (!StringUtil.isEmpty(emptyLine)) throw new IOException("Empty line after -DOCSTART- not empty: '" + emptyLine + "'!"); continue; } String fields[] = line.split("\t"); if (fields.length == 2) { tokens.add(fields[0]); neTypes.add(fields[1]); } else { throw new IOException( "Expected two fields per line in training data, got " + fields.length + " for line '" + line + "'!"); } } // if no -DOCSTART- mark, check if we need to clear features every sentence if (clearFeatures.equalsIgnoreCase("yes")) { isClearAdaptiveData = true; } if (tokens.size() > 0) { // convert name tags into spans List<Span> names = new ArrayList<Span>(); int beginIndex = -1; int endIndex = -1; for (int i = 0; i < neTypes.size(); i++) { String neTag = neTypes.get(i); if (neTag.equals("O")) { // O means we don't have anything this round. if (beginIndex != -1) { names.add(extract(beginIndex, endIndex, neTypes.get(beginIndex))); beginIndex = -1; endIndex = -1; } } else if (neTag.startsWith("B-")) { // B- prefix means we have two same entities of the same class next to each other if (beginIndex != -1) { names.add(extract(beginIndex, endIndex, neTypes.get(beginIndex))); } beginIndex = i; endIndex = i + 1; } else if (neTag.startsWith("I-")) { // I- starts or continues a current name entity if (beginIndex == -1) { beginIndex = i; endIndex = i + 1; } else if (!neTag.endsWith(neTypes.get(beginIndex).substring(1))) { // we have a new tag type following a tagged word series // also may not have the same I- starting the previous! names.add(extract(beginIndex, endIndex, neTypes.get(beginIndex))); beginIndex = i; endIndex = i + 1; } else { endIndex++; } } else { throw new IOException("Invalid tag: " + neTag); } } // if one span remains, create it here if (beginIndex != -1) names.add(extract(beginIndex, endIndex, neTypes.get(beginIndex))); return new NameSample( tokens.toArray(new String[tokens.size()]), names.toArray(new Span[names.size()]), isClearAdaptiveData); } else if (line != null) { // Just filter out empty events, if two lines in a row are empty return read(); } else { // source stream is not returning anymore lines return null; } }