@Before
public void setup() throws IOException, URISyntaxException {
  ADTokenSampleStreamFactory factory =
      new ADTokenSampleStreamFactory(ADTokenSampleStreamFactory.Parameters.class);

  File dict =
      new File(getClass().getClassLoader()
          .getResource("opennlp/tools/tokenize/latin-detokenizer.xml").toURI());
  File data =
      new File(getClass().getClassLoader()
          .getResource("opennlp/tools/formats/ad.sample").toURI());

  String[] args = {
    "-data", data.getCanonicalPath(),
    "-encoding", "UTF-8",
    "-lang", "pt",
    "-detokenizer", dict.getCanonicalPath()
  };

  ObjectStream<TokenSample> tokenSampleStream = factory.create(args);

  TokenSample sample = tokenSampleStream.read();
  while (sample != null) {
    samples.add(sample);
    sample = tokenSampleStream.read();
  }
}
public void run(String[] args) {
  if (0 == args.length) {
    System.out.println(getHelp());
  } else {
    DoccatModel model = new DoccatModelLoader().load(new File(args[0]));
    DocumentCategorizerME doccat = new DocumentCategorizerME(model);

    ObjectStream<String> documentStream =
        new ParagraphStream(new PlainTextByLineStream(new InputStreamReader(System.in)));

    PerformanceMonitor perfMon = new PerformanceMonitor(System.err, "doc");
    perfMon.start();

    try {
      String document;
      while ((document = documentStream.read()) != null) {
        double[] prob = doccat.categorize(WhitespaceTokenizer.INSTANCE.tokenize(document));
        String category = doccat.getBestCategory(prob);

        DocumentSample sample = new DocumentSample(category, document);
        System.out.println(sample.toString());

        perfMon.incrementCounter();
      }
    } catch (IOException e) {
      CmdLineUtil.handleStdinIoError(e);
    }

    perfMon.stopAndPrintFinalResult();
  }
}
public void run(String[] args) {
  Parameters params = validateAndParseParams(args, Parameters.class);

  File testData = new File(params.getCensusData());
  File dictOutFile = new File(params.getDict());

  CmdLineUtil.checkInputFile("Name data", testData);
  CmdLineUtil.checkOutputFile("Dictionary file", dictOutFile);

  FileInputStream sampleDataIn = CmdLineUtil.openInFile(testData);
  ObjectStream<StringList> sampleStream =
      new NameFinderCensus90NameStream(sampleDataIn, Charset.forName(params.getEncoding()));

  Dictionary mDictionary;
  try {
    System.out.println("Creating Dictionary...");
    mDictionary = createDictionary(sampleStream);
  } catch (IOException e) {
    throw new TerminateToolException(-1,
        "IO error while reading training data or indexing data: " + e.getMessage(), e);
  } finally {
    try {
      sampleStream.close();
    } catch (IOException e) {
      // closing the sample stream may fail; nothing more can be done here
    }
  }

  System.out.println("Saving Dictionary...");

  OutputStream out = null;
  try {
    out = new FileOutputStream(dictOutFile);
    mDictionary.serialize(out);
  } catch (IOException e) {
    throw new TerminateToolException(-1,
        "IO error while writing dictionary file: " + e.getMessage(), e);
  } finally {
    if (out != null) {
      try {
        out.close();
      } catch (IOException e) {
        // the file might be damaged
        throw new TerminateToolException(-1,
            "Attention: Failed to correctly write dictionary: " + e.getMessage(), e);
      }
    }
  }
}
public StringList read() throws IOException {
  String line = lineStream.read();
  StringList name = null;

  if ((line != null) && (!StringUtil.isEmpty(line))) {
    String name2;
    // find the location of the name separator in the line of data
    int pos = line.indexOf(' ');
    if (pos != -1) {
      String parsed = line.substring(0, pos);
      // the data is in ALL CAPS, so the easiest way is to convert it back to
      // standard mixed case; names starting with "Mc" also get their third
      // letter capitalized
      if ((parsed.length() > 2) && (parsed.startsWith("MC"))) {
        name2 = parsed.substring(0, 1).toUpperCase(locale)
            + parsed.substring(1, 2).toLowerCase(locale)
            + parsed.substring(2, 3).toUpperCase(locale)
            + parsed.substring(3).toLowerCase(locale);
      } else {
        name2 = parsed.substring(0, 1).toUpperCase(locale)
            + parsed.substring(1).toLowerCase(locale);
      }
      name = new StringList(new String[] {name2});
    }
  }

  return name;
}
/**
 * Creates a dictionary.
 *
 * @param sampleStream stream of samples.
 * @return a {@code Dictionary} class containing the name dictionary built from the input file.
 * @throws IOException if reading from the sample stream fails
 */
public static Dictionary createDictionary(ObjectStream<StringList> sampleStream)
    throws IOException {
  Dictionary mNameDictionary = new Dictionary(true);

  StringList entry = sampleStream.read();
  while (entry != null) {
    if (!mNameDictionary.contains(entry)) {
      mNameDictionary.put(entry);
    }
    entry = sampleStream.read();
  }

  return mNameDictionary;
}
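// Usage sketch for createDictionary, assuming a Census'90-style last-name
// list; the file name "census-names.txt" is an illustrative placeholder,
// not part of the original code.
ObjectStream<StringList> names = new NameFinderCensus90NameStream(
    new FileInputStream("census-names.txt"), Charset.forName("UTF-8"));
try {
  Dictionary nameDict = createDictionary(names);
  System.out.println("Dictionary entries: " + nameDict.size());
} finally {
  names.close();
}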
public void createModel(String modelType, String trainFile) throws IOException {
  Charset charset = Charset.forName("UTF-8");
  System.out.println("File path: " + trainFile);

  ObjectStream<String> lineStream = new PlainTextByLineStream(
      new FileInputStream(ModelTypes.TRAIN_FILE_BASE_LOCATION + trainFile + ".train"), charset);
  ObjectStream<NameSample> sampleStream = new NameSampleDataStream(lineStream);

  TokenNameFinderModel model;
  BufferedOutputStream modelOut = null;

  try {
    // alternative train(...) signatures:
    // model = NameFinderME.train("en", modelType, sampleStream, null);
    // model = NameFinderME.train("en", modelType, sampleStream,
    //     Collections.<String, Object>emptyMap(), 70, 1);
    model = NameFinderME.train("en", modelType, sampleStream,
        (AdaptiveFeatureGenerator) null, Collections.<String, Object>emptyMap(), 100, 1);
  } finally {
    sampleStream.close();
  }

  try {
    modelOut = new BufferedOutputStream(
        new FileOutputStream(ModelTypes.BIN_FILE_BASE_LOCATION + trainFile + ".bin"));
    model.serialize(modelOut);
  } finally {
    if (modelOut != null) {
      modelOut.close();
    }
  }
}
public static Dictionary buildNGramDictionary(ObjectStream<POSSample> samples, int cutoff)
    throws IOException {
  NGramModel ngramModel = new NGramModel();

  POSSample sample;
  while ((sample = samples.read()) != null) {
    String[] words = sample.getSentence();

    if (words.length > 0) {
      ngramModel.add(new StringList(words), 1, 1);
    }
  }

  ngramModel.cutoff(cutoff, Integer.MAX_VALUE);

  return ngramModel.toDictionary(true);
}
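// Usage sketch for buildNGramDictionary; the file name "pos.train" is an
// illustrative assumption and is expected to hold word_tag sentences in the
// default OpenNLP POS format.
ObjectStream<POSSample> posSamples = new WordTagSampleStream(
    new PlainTextByLineStream(new FileInputStream("pos.train"), Charset.forName("UTF-8")));
try {
  // keep only unigrams that occur at least 5 times
  Dictionary ngramDict = buildNGramDictionary(posSamples, 5);
} finally {
  posSamples.close();
}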
public static void populatePOSDictionary(ObjectStream<POSSample> samples,
    MutableTagDictionary dict, int cutoff) throws IOException {
  System.out.println("Expanding POS Dictionary ...");
  long start = System.nanoTime();

  // the data structure stores the word, the tag, and the number of occurrences
  Map<String, Map<String, AtomicInteger>> newEntries =
      new HashMap<String, Map<String, AtomicInteger>>();

  POSSample sample;
  while ((sample = samples.read()) != null) {
    String[] words = sample.getSentence();
    String[] tags = sample.getTags();

    for (int i = 0; i < words.length; i++) {
      // only store words that contain no digits
      if (!StringPattern.recognize(words[i]).containsDigit()) {
        String word;
        if (dict.isCaseSensitive()) {
          word = words[i];
        } else {
          word = StringUtil.toLowerCase(words[i]);
        }

        if (!newEntries.containsKey(word)) {
          newEntries.put(word, new HashMap<String, AtomicInteger>());
        }

        String[] dictTags = dict.getTags(word);
        if (dictTags != null) {
          for (String tag : dictTags) {
            // tags already in the dictionary start out at the cutoff
            Map<String, AtomicInteger> value = newEntries.get(word);
            if (!value.containsKey(tag)) {
              value.put(tag, new AtomicInteger(cutoff));
            }
          }
        }

        if (!newEntries.get(word).containsKey(tags[i])) {
          newEntries.get(word).put(tags[i], new AtomicInteger(1));
        } else {
          newEntries.get(word).get(tags[i]).incrementAndGet();
        }
      }
    }
  }

  // now check whether the word + tag pairs have enough occurrences; if yes,
  // add them to the dictionary
  for (Entry<String, Map<String, AtomicInteger>> wordEntry : newEntries.entrySet()) {
    List<String> tagsForWord = new ArrayList<String>();

    for (Entry<String, AtomicInteger> entry : wordEntry.getValue().entrySet()) {
      if (entry.getValue().get() >= cutoff) {
        tagsForWord.add(entry.getKey());
      }
    }

    if (tagsForWord.size() > 0) {
      dict.put(wordEntry.getKey(), tagsForWord.toArray(new String[tagsForWord.size()]));
    }
  }

  System.out.println("... finished expanding POS Dictionary. ["
      + (System.nanoTime() - start) / 1000000 + "ms]");
}
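// Usage sketch for populatePOSDictionary, assuming the sample stream from the
// previous sketch and that POSDictionary (which implements
// MutableTagDictionary) is used as the backing dictionary; the
// case-insensitive constructor argument is an illustrative choice.
MutableTagDictionary tagDict = new POSDictionary(false);
populatePOSDictionary(posSamples, tagDict, 5);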
public void close() throws IOException {
  lineStream.close();
}
public void reset() throws IOException, UnsupportedOperationException {
  lineStream.reset();
}
public NameSample read() throws IOException {
  List<String> tokens = new ArrayList<String>();
  List<String> neTypes = new ArrayList<String>();

  boolean isClearAdaptiveData = false;

  // an empty line indicates the end of a sentence
  String line;
  while ((line = lineStream.read()) != null && !StringUtil.isEmpty(line)) {
    // clear adaptive data if a document mark appears, following
    // CoNLL03 conventions
    if (clearFeatures.equalsIgnoreCase("docstart") && line.startsWith("-DOCSTART-")) {
      isClearAdaptiveData = true;

      String emptyLine = lineStream.read();
      if (!StringUtil.isEmpty(emptyLine)) {
        throw new IOException("Empty line after -DOCSTART- not empty: '" + emptyLine + "'!");
      }

      continue;
    }

    String[] fields = line.split("\t");
    if (fields.length == 2) {
      tokens.add(fields[0]);
      neTypes.add(fields[1]);
    } else {
      throw new IOException("Expected two fields per line in training data, got "
          + fields.length + " for line '" + line + "'!");
    }
  }

  // if there is no -DOCSTART- mark, check whether features must be cleared for every sentence
  if (clearFeatures.equalsIgnoreCase("yes")) {
    isClearAdaptiveData = true;
  }

  if (tokens.size() > 0) {
    // convert name tags into spans
    List<Span> names = new ArrayList<Span>();

    int beginIndex = -1;
    int endIndex = -1;
    for (int i = 0; i < neTypes.size(); i++) {
      String neTag = neTypes.get(i);
      if (neTag.equals("O")) {
        // O means the token is outside any entity; close a pending span if there is one
        if (beginIndex != -1) {
          names.add(extract(beginIndex, endIndex, neTypes.get(beginIndex)));
          beginIndex = -1;
          endIndex = -1;
        }
      } else if (neTag.startsWith("B-")) {
        // the B- prefix starts a new entity, possibly directly after another
        // entity of the same type
        if (beginIndex != -1) {
          names.add(extract(beginIndex, endIndex, neTypes.get(beginIndex)));
        }
        beginIndex = i;
        endIndex = i + 1;
      } else if (neTag.startsWith("I-")) {
        // the I- prefix starts or continues the current entity
        if (beginIndex == -1) {
          beginIndex = i;
          endIndex = i + 1;
        } else if (!neTag.endsWith(neTypes.get(beginIndex).substring(1))) {
          // an I- tag of a different type follows the current entity,
          // so close the current span and start a new one
          names.add(extract(beginIndex, endIndex, neTypes.get(beginIndex)));
          beginIndex = i;
          endIndex = i + 1;
        } else {
          endIndex++;
        }
      } else {
        throw new IOException("Invalid tag: " + neTag);
      }
    }

    // if a span is still open, close it here
    if (beginIndex != -1) {
      names.add(extract(beginIndex, endIndex, neTypes.get(beginIndex)));
    }

    return new NameSample(tokens.toArray(new String[tokens.size()]),
        names.toArray(new Span[names.size()]), isClearAdaptiveData);
  } else if (line != null) {
    // just filter out empty events, if two lines in a row are empty
    return read();
  } else {
    // the source stream is not returning any more lines
    return null;
  }
}