/** * Save the dataset to the given stream. * * @param os an output stream to where the dataset is saved. */ public void save(PrintStream ps) { for (int idxExample = 0; idxExample < getNumberOfExamples(); ++idxExample) { PQInput2 input = inputExamples[idxExample]; PQOutput2 output = outputExamples[idxExample]; // The sentence identifier string. ps.print(input.getId()); int numberOfQuotations = input.getNumberOfQuotations(); for (int quotationIdx = 0; quotationIdx < numberOfQuotations; ++quotationIdx) { // Quotation separator. ps.print("§"); int numberOfCoreferences = input.getNumberOfCoreferences(quotationIdx); for (int coreferenceIdx = 0; coreferenceIdx < numberOfCoreferences; ++coreferenceIdx) { // Coreference features. for (int ftr : input.getFeatureCodes(quotationIdx, coreferenceIdx)) ps.print(featureEncoding.getValueByCode(ftr) + " "); // Coreference separator. ps.print("\t"); } // Quotation author. ps.println(featureEncoding.getValueByCode(output.getAuthor(quotationIdx))); } // Next line for the next example. ps.println(); } }
/** * Add the examples in the given dataset to this dataset. * * @param other */ public void add(PQDataset2 other) throws DatasetException { if (!featureEncoding.equals(other.featureEncoding)) throw new DatasetException("Different encodings"); // Alloc room to store both datasets (this one and the given one). PQInput2[] newInputExamples = new PQInput2[inputExamples.length + other.inputExamples.length]; PQOutput2[] newOutputExamples = new PQOutput2[outputExamples.length + other.outputExamples.length]; // Copy (only reference) the examples in this dataset to the new arrays. int idx = 0; for (; idx < inputExamples.length; ++idx) { newInputExamples[idx] = inputExamples[idx]; newOutputExamples[idx] = outputExamples[idx]; } // Copy (only reference) the examples in the given dataset to the new // arrays. for (int idxO = 0; idxO < other.inputExamples.length; ++idxO, ++idx) { newInputExamples[idx] = other.inputExamples[idxO]; newOutputExamples[idx] = other.outputExamples[idxO]; } // Adjust the pointers of this dataset to the new arrays. this.inputExamples = newInputExamples; this.outputExamples = newOutputExamples; }
/**
 * Report how many symbols the feature-value encoding object of this dataset holds.
 *
 * <p>Usually this is the number of distinct symbols in the dataset. It can differ when the same
 * encoding object has also been used by code other than this dataset.
 *
 * @return the size of the feature encoding.
 */
public int getNumberOfSymbols() {
  return this.featureEncoding.size();
}
/** * Parse the given string and load an example. * * @param buff a string that contains an example. * @return <code>true</code> if the given string is a valid example. * @throws DatasetException if there is some format problem with the given string. */ public boolean parseExample( Collection<PQInput2> exampleInputs, Collection<PQOutput2> exampleOutputs, String buff) throws DatasetException { // Split quotations. String quotationsInput[] = buff.split("§"); if (quotationsInput.length == 0) return false; // First field: document ID. if (quotationsInput[0].trim().length() == 0) return false; String docId = quotationsInput[0]; LinkedList<LinkedList<LinkedList<Integer>>> exampleInputAsList = new LinkedList<LinkedList<LinkedList<Integer>>>(); LinkedList<Integer> exampleOutputAsList = new LinkedList<Integer>(); LinkedList<Quotation> quotationsAsList = new LinkedList<Quotation>(); // Walk into the document quotations. for (int idxQuote = 1; idxQuote < quotationsInput.length; idxQuote++) { // Split candidate coreferences of the given quotation. String coreferences[] = quotationsInput[idxQuote].split("\\t"); if (coreferences.length == 0) return false; // First field: quotation start index, quotation end index, // right coreference index. String firstField[] = coreferences[0].split("[ ]"); if (firstField.length != 3) return false; // Quotation start index, quotation end index. if ((firstField[0].trim().length() == 0) || (firstField[1].trim().length() == 0)) return false; Quotation quotation = new Quotation(coreferences.length - 1); quotation.setQuotationIndex(Integer.parseInt(firstField[0]), Integer.parseInt(firstField[1])); // Right coreference index. if (firstField[2].trim().length() == 0) return false; int rightCoref = Integer.parseInt(firstField[2]) + 1; // Walk into the coreference feature list. 
LinkedList<LinkedList<Integer>> corefFeatureList = new LinkedList<LinkedList<Integer>>(); for (int idxCoref = 1; idxCoref < coreferences.length; ++idxCoref) { String coreference = coreferences[idxCoref]; // Parse the given coreference features. String[] features = coreference.split("[ ]"); if (firstField.length < 2) return false; // First field: coreference start index, coreference end index. if ((features[0].trim().length() == 0) || (features[1].trim().length() == 0)) return false; quotation.setCoreferenceIndex( idxCoref - 1, Integer.parseInt(features[0]), Integer.parseInt(features[1])); // Encode the features. LinkedList<Integer> featureList = new LinkedList<Integer>(); for (int idxFtr = 2; idxFtr < features.length; ++idxFtr) { int code = featureEncoding.put(features[idxFtr]); if (code >= 0) featureList.add(code); } corefFeatureList.add(featureList); } // Example input. exampleInputAsList.add(corefFeatureList); // Example output. exampleOutputAsList.add(rightCoref); // Quotation index information. quotationsAsList.add(quotation); } // Store the loaded example. if (training) { /* * Training examples must store internally their indexes in the * array of training examples. */ exampleInputs.add(new PQInput2(docId, exampleInputAsList, quotationsAsList)); exampleOutputs.add(new PQOutput2(exampleOutputAsList)); } else { exampleInputs.add(new PQInput2(docId, exampleInputAsList, quotationsAsList)); exampleOutputs.add(new PQOutput2(exampleOutputAsList)); } return true; }