Esempio n. 1
0
  /**
   * Writes a plain-text corpus, one sentence per line, from the serialized abstracts.
   *
   * <p>Every file in the directory named by the {@code corpusDir} property is deserialized as an
   * {@code Abstract}. Each sentence is tokenized, POS-tagged (model path taken from the
   * {@code pos_tagger} property) and stemmed; every token is then passed through
   * {@code NNADE.wordPreprocess} and written to {@code output}, each followed by a single space.
   * The file is encoded as UTF-8.
   *
   * @param properties supplies the {@code corpusDir} and {@code pos_tagger} settings
   * @param parameters forwarded to {@code NNADE.wordPreprocess}; its {@code wordPreprocess} and
   *     {@code embeddingSize} fields are echoed to stdout when the corpus is complete
   * @param output path of the corpus file to create
   * @throws IllegalArgumentException if {@code corpusDir} cannot be listed as a directory
   * @throws Exception if an abstract cannot be read or the corpus cannot be written
   */
  public static void prepareWord2VecCorpus(
      Properties properties, Parameters parameters, String output) throws Exception {
    File fAbstractDir = new File(PropertiesUtils.getString(properties, "corpusDir", ""));
    // listFiles() yields null (not an empty array) when the path is not a directory or an I/O
    // error occurs; fail early, before the output file is created or truncated.
    File[] abstractFiles = fAbstractDir.listFiles();
    if (abstractFiles == null) {
      throw new IllegalArgumentException(
          "corpusDir is not a readable directory: " + fAbstractDir.getPath());
    }

    File fOutputFile = new File(output);
    Tokenizer tokenizer = new Tokenizer(true, ' ');
    MaxentTagger tagger = new MaxentTagger(PropertiesUtils.getString(properties, "pos_tagger", ""));
    Morphology morphology = new Morphology();

    // try-with-resources guarantees the stream is closed even if processing fails midway.
    try (FileOutputStream fos = new FileOutputStream(fOutputFile);
        OutputStreamWriter osw = new OutputStreamWriter(fos, "utf-8")) {
      for (File abstractFile : abstractFiles) {
        Abstract ab =
            (Abstract) ObjectSerializer.readObjectFromFile(abstractFile.getAbsolutePath());

        for (ADESentence sentence : ab.sentences) {
          List<CoreLabel> tokens = tokenizer.tokenize(sentence.offset, sentence.text);
          tagger.tagCoreLabels(tokens);
          // Stemming happens after tagging for every token of the sentence.
          for (CoreLabel token : tokens) {
            morphology.stem(token);
          }
          for (CoreLabel token : tokens) {
            osw.write(NNADE.wordPreprocess(token, parameters) + " ");
          }
          osw.write("\n");
        }
      }
    }

    System.out.println("Generate a word2vec corpus.");
    System.out.printf("wordPreprocess = %d%n", parameters.wordPreprocess);
    System.out.printf("embeddingSize = %d%n", parameters.embeddingSize);
  }
Esempio n. 2
0
  /**
   * Prints the known words whose embedding vectors are closest (Euclidean distance) to that of
   * {@code w}.
   *
   * <p>The model is deserialized from a fixed path. Words whose id is 0 or 1 are printed on their
   * own line and excluded from the search. At most 100 neighbours are printed, nearest first,
   * each followed by a space.
   *
   * @param w the query word; must be a known word with an id other than 0 or 1
   * @throws IllegalArgumentException if no embedding vector is found for {@code w}
   * @throws Exception if the model cannot be read
   */
  static void findClosestWord(String w) throws Exception {
    // Second argument handed to the Matrix constructor for every embedding row
    // (presumably the embedding dimension of the stored model — TODO confirm).
    final int vectorWidth = 50;
    // Upper bound on the number of neighbours kept and printed.
    final int max = 100;

    NNADE nnade = (NNADE) ObjectSerializer.readObjectFromFile("E:\\ade\\nnade.ser0");

    ArrayList<String> words = new ArrayList<>();
    ArrayList<Matrix> vectors = new ArrayList<>();
    Matrix wVector = null;
    for (String word : nnade.knownWords) {
      int index = nnade.wordIDs.get(word);
      if (index == 0 || index == 1) {
        System.out.println(word);
        continue;
      }
      Matrix vector = new Matrix(1, vectorWidth, nnade.E[index]);
      if (w.equals(word)) {
        // The query word itself is kept aside and never offered as its own neighbour.
        wVector = vector;
        continue;
      }

      words.add(word);
      vectors.add(vector);
    }

    // Without a query vector every distance computation below would receive null.
    if (wVector == null) {
      throw new IllegalArgumentException("No embedding vector found for word: " + w);
    }

    // Parallel lists kept sorted by ascending distance. Distances are cached so that each one is
    // computed once (assumes Matrix.distanceEuclidean has no side effects).
    ArrayList<String> nearestWords = new ArrayList<>();
    ArrayList<Double> nearestDists = new ArrayList<>();
    for (int i = 0; i < vectors.size(); i++) {
      double currentDist = Matrix.distanceEuclidean(vectors.get(i), wVector);

      // First slot holding a strictly larger distance; equal distances keep insertion order.
      int pos = 0;
      while (pos < nearestDists.size() && !(currentDist < nearestDists.get(pos))) {
        pos++;
      }
      nearestDists.add(pos, currentDist);
      nearestWords.add(pos, words.get(i));

      // Drop the farthest entry once the list exceeds the limit.
      if (nearestDists.size() > max) {
        nearestDists.remove(nearestDists.size() - 1);
        nearestWords.remove(nearestWords.size() - 1);
      }
    }

    // dump result
    for (String t : nearestWords) {
      System.out.print(t + " ");
    }
  }
Esempio n. 3
0
  /**
   * Clusters the embedding vectors of a fixed list of words into two classes and reports the
   * result.
   *
   * <p>The model is deserialized from a fixed path. Words whose id is 0 or 1 are printed and
   * skipped. The remaining vectors are normalized, {@code Buckshot} produces the centroids handed
   * to {@code KMeans}, and the assignment is written to {@code E:\ade\kmeans.txt} (ten words per
   * line, classes labelled from 0). Finally, up to ten words per class, ordered by ascending
   * {@code mk.similarity} to the class centroid, are printed to stdout (classes labelled from 1).
   *
   * <p>An earlier, now removed, variant built the word set dynamically: it decoded every sentence
   * of {@code abs} with the model and, for each predicted relation between two single-token
   * entities, collected the lemmas of both entities and of any trigger word (induce, associate,
   * relate, cause, develop, produce, after, follow, result) found between them.
   *
   * @param tool not used by the current implementation
   * @param parameters not used by the current implementation
   * @param abs not used by the current implementation
   * @throws Exception if the model cannot be read or the result file cannot be written
   */
  static void kmeansWord(Tool tool, Parameters parameters, List<Abstract> abs) throws Exception {
    NNADE nnade = (NNADE) ObjectSerializer.readObjectFromFile("E:\\ade\\nnade.ser0");
    HashSet<String> results =
        new HashSet<String>(
            Arrays.asList(
                "induce",
                "clozapine",
                "mtx",
                "heparin",
                "colchicine",
                "methotrexate",
                "thrombocytopenia",
                "glaucoma",
                "myopathy",
                "thyrotoxicosis"));

    ArrayList<String> words = new ArrayList<>();
    ArrayList<Matrix> vectors = new ArrayList<>();
    for (String word : results) {
      int index = nnade.wordIDs.get(word);
      if (index == 0 || index == 1) {
        System.out.println(word);
        continue;
      }

      words.add(word);
      vectors.add(new Matrix(1, 50, nnade.E[index]));
    }

    int k = 2;
    // normalize (return value is ignored, so this presumably works in place — TODO confirm)
    for (int i = 0; i < vectors.size(); i++) {
      Normalizer.doVectorNormalizing(vectors.get(i));
    }
    // do Buckshot
    Buckshot bs = new Buckshot(k, vectors);
    ArrayList<Matrix> centroids = bs.doBuckshot();

    // do KMeans (1000 is presumably the iteration limit — TODO confirm)
    KMeans mk = new KMeans(k, vectors, centroids, 1000);
    mk.getResults();

    // try-with-resources closes the file even when a write fails.
    try (FileOutputStream fos = new FileOutputStream("E:\\ade\\kmeans.txt");
        OutputStreamWriter osw1 = new OutputStreamWriter(fos, "utf-8")) {
      // dump result, i denotes class, j denotes word
      for (int i = 0; i < k; i++) {
        osw1.write("##the " + i + " class :\n");
        int line = 0;
        for (int j = 0; j < mk.vectors2classes.length; j++) {
          if (i == mk.vectors2classes[j]) {
            osw1.write(words.get(j) + " ");
            line++;
            // Wrap after every tenth word of the class.
            if (line == 10) {
              osw1.write("\n");
              line = 0;
            }
          }
        }
        osw1.write("\n\n");
      }
    }

    int max = 10;
    ArrayList<ArrayList<Matrix>> outMat = new ArrayList<ArrayList<Matrix>>();
    ArrayList<ArrayList<String>> outWord = new ArrayList<ArrayList<String>>();
    for (int i = 0; i < k; i++) {
      outMat.add(new ArrayList<Matrix>());
      outWord.add(new ArrayList<String>());
    }
    // Per class, keep the `max` entries with the smallest mk.similarity value to the centroid.
    // NOTE(review): if similarity grows with closeness this keeps the least similar words —
    // confirm the intended direction against KMeans.similarity.
    for (int i = 0; i < mk.vectors2classes.length; i++) {
      int cls = mk.vectors2classes[i];
      ArrayList<Matrix> mat = outMat.get(cls);
      ArrayList<String> word = outWord.get(cls);

      double currentDist = mk.similarity(vectors.get(i), mk.centroids.get(cls));
      boolean inserted = false;
      for (int j = 0; j < mat.size(); j++) {
        double oldDist = mk.similarity(mat.get(j), mk.centroids.get(cls));
        if (currentDist < oldDist) {
          mat.add(j, vectors.get(i));
          word.add(j, words.get(i));
          inserted = true;
          break;
        }
      }

      if (!inserted) {
        mat.add(vectors.get(i));
        word.add(words.get(i));
      }
      // Drop the last (largest-valued) entry once the list exceeds the limit.
      if (mat.size() > max) {
        mat.remove(mat.size() - 1);
        word.remove(word.size() - 1);
      }
    }
    // dump result, i denotes class, w denotes word
    for (int i = 0; i < outWord.size(); i++) {
      ArrayList<String> word = outWord.get(i);
      System.out.print("##the " + (i + 1) + " class :\n");
      for (String w : word) {
        System.out.print(w + " ");
      }
      System.out.println();
    }
  }