Example #1
0
  /**
   * Reads a semicolon-separated concordance file and turns every well-formed line into an
   * {@code Instance} that is added, through the given pipe, to a newly created
   * {@code InstanceList}.
   *
   * <p>Each line must consist of exactly four {@code ';'}-separated fields: the document ID
   * (an optional {@code "Doc ID: "} label is stripped), the line ID ({@code "Line ID: "}
   * stripped), the sense ID ({@code "DOE sense ID: "} stripped) and the line text. Lines with a
   * different number of fields are reported on standard output and skipped.
   *
   * <p>For every accepted line the context tokens are obtained from
   * {@code corpus.getWindowTokens(...)}. The {@code Instance} is constructed from, in this
   * order: the tokens joined by single spaces with all {@code '.'} characters removed, the sense
   * ID, the instance ID ({@code docID + "_" + lineID} with spaces replaced by underscores), and
   * the raw text field.
   *
   * <p>I/O errors (including a missing file) are printed via {@code printStackTrace()}; the
   * instances collected up to that point are still returned.
   *
   * @param targetTerm term whose context window is looked up; the spelling {@code "faeder"} is
   *     mapped to {@code "fæder"} before the lookup, any other value (including {@code null}) is
   *     passed through unchanged
   * @param sourceFile path of the concordance file to read
   * @param termWindowSize window size handed to {@code corpus.getWindowTokens(...)}; a token
   *     list whose size differs from {@code 2 * termWindowSize} is reported as incomplete but is
   *     still used
   * @param pipe pipe the returned {@code InstanceList} is created with and through which every
   *     instance is added
   * @param useCollocationalVector if {@code true}, each token is suffixed with {@code "_"} and
   *     its position relative to the target term ({@code -termWindowSize..-1} followed by
   *     {@code 1..termWindowSize}; position 0 is skipped)
   * @return the instance list built from the file; never {@code null}, possibly empty
   */
  private static InstanceList readConcordanceFileToInstanceList(
      String targetTerm,
      String sourceFile,
      int termWindowSize,
      Pipe pipe,
      boolean useCollocationalVector) {
    InstanceList instanceList = new InstanceList(pipe);

    // Loop-invariant spelling normalisation, done once instead of reassigning the parameter on
    // every line. Presumably the corpus is indexed under the "æ" spelling -- TODO confirm.
    final String lookupTerm = "faeder".equals(targetTerm) ? "fæder" : targetTerm;

    // NOTE(review): FileReader decodes with the platform default charset (before Java 18 this is
    // not necessarily UTF-8). Confirm the encoding of the concordance files and pass an explicit
    // charset if they contain non-ASCII text.
    try (BufferedReader in = new BufferedReader(new FileReader(sourceFile))) {
      int incomplete = 0;

      String line;
      while ((line = in.readLine()) != null) {
        String[] fields = line.split(";");

        if (fields.length != 4) {
          System.out.println(
              "WARNING: Skipping possibly invalid CSV line " + line + " in file " + sourceFile);
          continue;
        }

        String docID = fields[0].replace("Doc ID: ", "").trim();
        String lineID = fields[1].replace("Line ID: ", "").trim();
        String instanceID = (docID + "_" + lineID).replaceAll(" ", "_");
        String senseID = fields[2].replace("DOE sense ID: ", "").trim();
        String text = fields[3];

        ArrayList<String> data = corpus.getWindowTokens(lookupTerm, docID, lineID, termWindowSize);

        // A short window is only reported; the instance is still built from what was found.
        if (data.size() != 2 * termWindowSize) {
          incomplete++;
          System.out.println("WARNING: Incomplete token list " + incomplete + " found " + data);
        }

        if (useCollocationalVector) {
          System.out.println("Converting data to collocational vector: \n\t" + data);

          // Walk the relative positions -termWindowSize..termWindowSize and tag the tokens in
          // list order. Position 0 is the target term and consumes no token.
          int index = 0;
          for (int offset = -termWindowSize;
              offset <= termWindowSize && index < data.size();
              offset++) {
            if (offset == 0) {
              continue;
            }
            data.set(index, data.get(index) + "_" + offset);
            index++;
          }

          System.out.println("Converting data to collocational vector...DONE\n\t" + data);
        }

        // Flatten the list's toString() form ("[a, b, c]") into "a b c" and drop all periods.
        String dataStr =
            data.toString().replace(", ", " ").replace("[", "").replace("]", "").replace(".", "");
        Instance trainingInstance = new Instance(dataStr, senseID, instanceID, text);

        instanceList.addThruPipe(trainingInstance);
      }
    } catch (IOException e) {
      // Also covers FileNotFoundException (a subclass). The reader is closed automatically by
      // try-with-resources, so no explicit close()/finally is needed.
      e.printStackTrace();
    }

    return instanceList;
  }