示例#1
0
  /**
   * Converts two stored word alignments into index-based alignment maps.
   *
   * <p>Prompts (via {@code UTIL_UserInput}) for the names of the forward and the backward
   * alignment objects, loads both, and for each of them stores a map from every {@code Bitext} to
   * a {@code source index -> target index} map. The forward result is stored as
   * {@code "forward.aMap"}, the backward result as {@code "reverse.aMap"}.
   *
   * <p>The process is terminated with exit status 1 as soon as a source word cannot be mapped to
   * any target position.
   */
  public static void extractPhrase() {

    System.out.println("name of forward alignment");
    String fwA = UTIL_UserInput.fileNameInput();
    HashMap<Bitext, AlignPair> fwAlignment =
        (HashMap<Bitext, AlignPair>) UTIL_FileOperations.openObject(fwA);
    System.out.println("name of backward alignment");
    String bwA = UTIL_UserInput.fileNameInput();
    HashMap<Bitext, AlignPair> bwAlignment =
        (HashMap<Bitext, AlignPair>) UTIL_FileOperations.openObject(bwA);

    // Both maps are loaded before any processing starts, then handled one after the other.
    storeIndexAlignments(fwAlignment, "forward.aMap");
    storeIndexAlignments(bwAlignment, "reverse.aMap");
  }

  /**
   * Builds the index alignment of every bitext in {@code alignments}, prints it, and stores the
   * collected result under {@code mapName}.
   */
  private static void storeIndexAlignments(
      HashMap<Bitext, AlignPair> alignments, String mapName) {
    HashMap<Bitext, HashMap<Integer, Integer>> alignmentMap =
        new HashMap<Bitext, HashMap<Integer, Integer>>();

    for (Map.Entry<Bitext, AlignPair> pair : alignments.entrySet()) {
      Bitext currentBitext = pair.getKey();
      HashMap<Integer, Integer> sourceTargetPair =
          alignSourceToTarget(currentBitext, pair.getValue().getAlignments());

      System.out.println("For bitext id: " + currentBitext.hashCode());
      for (Map.Entry<Integer, Integer> ip : sourceTargetPair.entrySet()) {
        System.out.println(
            ip.getKey()
                + " : "
                + ip.getValue()
                + "----"
                + currentBitext.getSource()[ip.getKey()]
                + " : "
                + currentBitext.getTarget()[ip.getValue()]);
      }
      alignmentMap.put(currentBitext, sourceTargetPair);
    }

    UTIL_FileOperations.store(alignmentMap, mapName);
  }

  /**
   * Maps each source position of {@code bitext} to one target position, using the first word pair
   * in {@code viterbi} whose {@code e} side equals the source word. Exits the process when a
   * source word ends up without a valid target position.
   */
  private static HashMap<Integer, Integer> alignSourceToTarget(
      Bitext bitext, ArrayList<WordPair> viterbi) {
    String[] source = bitext.getSource();
    String[] target = bitext.getTarget();
    HashMap<Integer, Integer> sourceTargetPair = new HashMap<Integer, Integer>();

    for (int sourceIndex = 0; sourceIndex < source.length; sourceIndex++) {
      // target.length is one past the last valid index and serves as the "not found" marker.
      int closestTarget = target.length;

      for (WordPair wp : viterbi) {
        if (!wp.e.equals(source[sourceIndex])) {
          continue;
        }
        // Collect every position at which the aligned target word occurs in the sentence.
        ArrayList<Integer> targetPositions = new ArrayList<Integer>();
        for (int t = 0; t < target.length; t++) {
          if (target[t].equals(wp.f)) {
            targetPositions.add(t);
          }
        }

        if (targetPositions.size() > 1) {
          System.out.println("multiple target words");
          closestTarget =
              closestRelativePosition(sourceIndex, source.length, targetPositions, target.length);
        } else if (targetPositions.size() == 1) {
          closestTarget = targetPositions.get(0);
        }
        // Only the first matching word pair is considered.
        break;
      }

      System.out.println(sourceIndex);
      System.out.println(closestTarget);
      if (closestTarget >= target.length) {
        System.out.println("Something is wrong");
        System.exit(1);
      }
      sourceTargetPair.put(sourceIndex, closestTarget);
    }
    return sourceTargetPair;
  }

  /**
   * Returns the candidate whose relative position in the target sentence is closest to the
   * relative position of {@code sourceIndex} in the source sentence; the earliest candidate wins
   * ties.
   */
  private static int closestRelativePosition(
      int sourceIndex, int sourceLength, ArrayList<Integer> targetPositions, int targetLength) {
    // Divide in floating point: integer division would truncate every ratio to 0.
    double sourcePercentage = (double) sourceIndex / sourceLength;
    double smallest = Double.MAX_VALUE;
    int closest = targetLength;
    for (Integer tp : targetPositions) {
      double targetPercentage = (double) tp / targetLength;
      double distance = Math.abs(sourcePercentage - targetPercentage);
      if (smallest > distance) {
        smallest = distance;
        closest = tp;
      }
    }
    return closest;
  }
  /**
   * Command-line entry point that dispatches on the option given in {@code args[0]}.
   *
   * <ul>
   *   <li>{@code -add FILENAME} reads word/sense records from the file, walks the example pairs
   *       supplied by {@code ExamplePairReader}, and creates three surveys (full context, partial
   *       context, no context) for every aligned word that qualifies.
   *   <li>{@code -delete FILENAME} calls {@code deleteHIT} for every line of the file.
   *   <li>{@code -approveAll FILENAME} calls {@code approveHIT} for every line of the file.
   * </ul>
   *
   * Anything else prints the usage text. Readers and writers opened here are closed even when
   * processing fails.
   *
   * @param args the option followed by the name of the input file
   * @throws IOException if an I/O error occurs outside the guarded section
   */
  public static void main(String[] args) throws IOException {

    String usageError =
        "Please provide a valid option. Such as: "
            + "\n -add FILENAME 				*creates new HITs from the data provided in the given file(s)* "
            + "\n -delete FILENAME        	*deletes all of the HITs with IDs matching those given in the file(s)*"
            + "\n -approveAll FILENAME 		*approves all the assignments for all HITs with IDs in the given file(s)*";

    if (args.length < 1) {
      System.out.println(usageError);
      return;
    }

    // Create an instance of this class.
    LexicalSubSurvey app = new LexicalSubSurvey();
    File inputFile = null;

    try {
      if (args.length > 1) inputFile = new File(args[1]);

      boolean optionNeedsFile =
          args[0].equals("-add") || args[0].equals("-delete") || args[0].equals("-approveAll");
      if (optionNeedsFile && inputFile == null) {
        // Without this guard the missing argument surfaces as a NullPointerException.
        System.err.println("Please provide a valid file name");
        System.out.println(usageError);
        return;
      }

      if (args[0].equals("-add")) {
        // When -add tag is given in adds HITs to Mechanical turk depending on the URL in the
        // mturk.properties file

        // Part-of-speech codes that are eligible for a survey, with their display names.
        Map<String, String> codeToPOS = new HashMap<String, String>(14);
        codeToPOS.put("NN", "Noun");
        codeToPOS.put("NNS", "Noun");
        codeToPOS.put("JJ", "Adjective");
        codeToPOS.put("JJR", "Adjective");
        codeToPOS.put("JJS", "Adjective");
        codeToPOS.put("RB", "Adverb");
        codeToPOS.put("RBR", "Adverb");
        codeToPOS.put("RBS", "Adverb");
        codeToPOS.put("VB", "Verb");
        codeToPOS.put("VBD", "Verb");
        codeToPOS.put("VBG", "Verb");
        codeToPOS.put("VBN", "Verb");
        codeToPOS.put("VBP", "Verb");
        codeToPOS.put("VBZ", "Verb");

        ExamplePairReader reader = new ExamplePairReader(PARSED, ALIGN);

        // typical file name: "sub.simple.first100"
        Map<String, String[]> wordToSense = readWordSenses(inputFile);

        DateFormat dateFormat = new SimpleDateFormat("yyyy-MM-ddHH:mm:ss");
        String stamp = dateFormat.format(new Date());

        try {
          // The three different experiments leave one uncommented at a time to do single groupings
          app.contextpr =
              new PrintWriter(
                  new FileOutputStream(new File(inputFile.getName() + "ContextGivenIDs" + stamp)));
          app.partialContextpr =
              new PrintWriter(
                  new FileOutputStream(
                      new File(inputFile.getName() + "partialContextIDs" + stamp)));
          app.noContextpr =
              new PrintWriter(
                  new FileOutputStream(
                      new File(inputFile.getName() + "NoContextGivenIDs" + stamp)));

          for (int k = 0;
              k < 1000000 && reader.hasNext();
              k++) { // for counted input goes through until reaches end or max number
            ExamplePair p = reader.next();
            Alignment align = p.getAlignment();
            ArrayList<Word> normalWords = p.getNormal().getWords();
            ArrayList<Word> simpleWords = p.getSimple().getWords();

            // creates object = list of simple words
            SimpleWordsList simpleWordsList = new SimpleWordsList();

            for (AlignPair pair : align) {
              int n = pair.getNormalIndex();
              int s = pair.getSimpleIndex();
              Word normal = normalWords.get(n);
              Word simple = simpleWords.get(s);
              boolean diffWords =
                  !normal.getWord().toLowerCase().equals(simple.getWord().toLowerCase());
              boolean normWordSimplePOS = codeToPOS.containsKey(normal.getPos());
              boolean posEqual = normal.getPos().equals(simple.getPos());
              boolean normalIsAlreadySimple = simpleWordsList.contains(normal.getWord());
              String[] associations = wordToSense.get(normal.getWord());
              boolean doWeHaveSense = associations != null;
              boolean contextMatch =
                  doWeHaveSense && associations[0].equals(p.getNormal().textString());

              if (diffWords
                  && normWordSimplePOS
                  && posEqual
                  && !normalIsAlreadySimple
                  && doWeHaveSense
                  && contextMatch) {
                String firstPart = "";
                String partialFirst = "";
                // The focus word may be the last word of the sentence, so guard the look-ahead.
                String wordAfterFocus =
                    (n + 1 < normalWords.size()) ? normalWords.get(n + 1).getWord() : "";
                String target = normal.getWord();
                if (!attachesToPreviousWord(wordAfterFocus)) {
                  target += " ";
                }
                String secondPart = "";
                String partialSecond = "";
                String sense = associations[1];
                String POS = codeToPOS.get(normal.getPos());

                for (int i = 0; i < normalWords.size(); i++) {
                  String currentWord = normalWords.get(i).getWord();
                  String nextWord = "";
                  if (i + 1 < normalWords.size()) {
                    nextWord = normalWords.get(i + 1).getWord();
                  }
                  boolean addSpace = !attachesToPreviousWord(nextWord);
                  if (i < n) {
                    // The partial context keeps only the two words right before the focus word.
                    boolean inPartial = i > n - 3;
                    firstPart += currentWord;
                    if (inPartial) partialFirst += currentWord;
                    if (addSpace) {
                      firstPart += " ";
                      if (inPartial) partialFirst += " ";
                    }
                  }
                  if (i > n) {
                    // The partial context keeps only the two words right after the focus word.
                    boolean inPartial = i < n + 3;
                    secondPart += currentWord;
                    if (inPartial) partialSecond += currentWord;
                    if (addSpace) {
                      secondPart += " ";
                      if (inPartial) partialSecond += " ";
                    }
                  }
                }

                // comment out 2 out of the 3 for single grouping
                app.createContextGivenSurvey(firstPart, target, secondPart);
                app.createPartialContextGivenSurvey(
                    partialFirst, target, partialSecond, sense, POS);
                app.createNoContextGivenSurvey(target, sense, POS);
              }
            }
          }
        } finally {
          // comment out 2 for single grouping
          if (app.contextpr != null) app.contextpr.close();
          if (app.partialContextpr != null) app.partialContextpr.close();
          if (app.noContextpr != null) app.noContextpr.close();
        }

      } else if (args[0].equals("-delete")) { // deletes the hits whose IDs are in the given file
        System.out.println("deleting");
        // IDs are usually stored in these files: NoContextGivenIDs, NoTargetGivenIDs,
        // ContextGivenIDs
        BufferedReader fileReader =
            new BufferedReader(new InputStreamReader(new FileInputStream(inputFile)));
        try {
          for (String hitId = fileReader.readLine();
              hitId != null;
              hitId = fileReader.readLine()) {
            System.out.println(hitId);
            app.deleteHIT(hitId);
          }
        } finally {
          fileReader.close();
        }

      } else if (args[0].equals(
          "-approveAll")) { // approves all submissions for all hits whose IDs in the given file
        System.out.println("approving");
        // IDs are usually stored in these files: NoContextGivenIDs, NoTargetGivenIDs,
        // ContextGivenIDs
        BufferedReader fileReader =
            new BufferedReader(new InputStreamReader(new FileInputStream(inputFile)));
        try {
          for (String hitId = fileReader.readLine();
              hitId != null;
              hitId = fileReader.readLine()) {
            System.out.println(hitId);
            app.approveHIT(hitId);
          }
        } finally {
          fileReader.close();
        }

      } else {
        System.err.println("No valid options were provided");
        System.out.println(usageError);
      }

    } catch (IOException e) {
      System.err.println("Could not find the file: \"" + args[1] + "\"");
      System.err.println("Please provide a valid file name");
    }
  }

  /**
   * Reads the tab-separated sense file and returns a map from focus word to
   * {@code {context, sense, simpleWord}}. The fields of a line are, in order: context, an ignored
   * field, focus word, simple word, sense. A later line for the same focus word replaces the
   * earlier one; lines with fewer than five fields are reported and skipped.
   */
  private static Map<String, String[]> readWordSenses(File inputFile) throws IOException {
    Map<String, String[]> wordToSense = new HashMap<String, String[]>(25);
    BufferedReader in =
        new BufferedReader(new InputStreamReader(new FileInputStream(inputFile)));
    try {
      for (String line = in.readLine(); line != null; line = in.readLine()) {
        StringTokenizer splitter = new StringTokenizer(line, "\t");
        if (splitter.countTokens() < 5) {
          System.err.println(
              "Skipping malformed line (expected 5 tab-separated fields): " + line);
          continue;
        }
        String context = splitter.nextToken();
        splitter.nextToken();
        String focusWord = splitter.nextToken();
        String simpleWord = splitter.nextToken();
        String sense = splitter.nextToken();

        wordToSense.put(focusWord, new String[] {context, sense, simpleWord});
      }
    } finally {
      in.close();
    }
    return wordToSense;
  }

  /**
   * Tells whether {@code word} is a single character that sorts before {@code "A"}; such tokens
   * are appended to the preceding word without a separating space.
   */
  private static boolean attachesToPreviousWord(String word) {
    return word.length() == 1 && word.compareTo("A") < 0;
  }