protected boolean readTAssignFile(String tassignFile) {
    // try-with-resources guarantees the reader is closed, including on early returns
    try (BufferedReader reader =
        new BufferedReader(new InputStreamReader(new FileInputStream(tassignFile), "UTF-8"))) {
      int i, j;

      String line;
      // z[m][n] will hold the topic assigned to word n of document m
      z = new short[M][];
      data = new LDADataset(M);
      data.V = V;
      for (i = 0; i < M; i++) {
        line = reader.readLine();
        if (line == null) {
          System.out.println("Topic assignment file ended early: expected " + M + " documents");
          return false;
        }
        StringTokenizer tknr = new StringTokenizer(line, " \t\r\n");

        int length = tknr.countTokens();

        ArrayList<Integer> words = new ArrayList<>();
        ArrayList<Integer> topics = new ArrayList<>();

        for (j = 0; j < length; j++) {
          String token = tknr.nextToken();

          StringTokenizer tknr2 = new StringTokenizer(token, ":");
          if (tknr2.countTokens() != 2) {
            System.out.println("Invalid word-topic assignment line\n");
            return false;
          }

          words.add(Integer.parseInt(tknr2.nextToken()));
          topics.add(Integer.parseInt(tknr2.nextToken()));
        } // end for each topic assignment

        // allocate and add new document to the corpus
        Document doc = new Document(words);
        data.setDoc(doc, i);

        // assign values for z
        z[i] = new short[topics.size()];
        for (j = 0; j < topics.size(); j++) {
          z[i][j] = topics.get(j).shortValue();
        }
      } // end for each doc
    } catch (Exception e) {
      System.out.println("Error while loading model: " + e.getMessage());
      e.printStackTrace();
      return false;
    }
    return true;
  }
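As parsed above, the topic-assignment (tassign) file holds one document per line, each line being whitespace-separated wordId:topicId pairs. The helper below is an illustrative sketch of the inverse operation, formatting one document's assignments into such a line; its name and parameters are hypothetical and not part of the class shown here.

  // Illustrative sketch only: formats one document's assignments in the
  // "wordId:topicId wordId:topicId ..." layout that readTAssignFile parses.
  // The method name and parameters are hypothetical.
  private static String formatTAssignLine(int[] wordIds, short[] topicIds) {
    StringBuilder sb = new StringBuilder();
    for (int n = 0; n < wordIds.length; n++) {
      if (n > 0) sb.append(' ');
      sb.append(wordIds[n]).append(':').append(topicIds[n]);
    }
    return sb.toString();
  }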
Example #2
  public Model inference(String[] strs) {
    // read the unseen documents into a dataset keyed by the shared global dictionary
    LDADataset dataset = LDADataset.readDataSet(strs, globalDict);

    return inference(dataset);
  }
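A minimal usage sketch for this entry point, assuming the stock JGibbLDA Inferencer API (init(LDACmdOption) followed by inference(String[])); the option fields and the wrapper method name below are illustrative assumptions, not taken from this excerpt.

  // Hypothetical usage sketch; assumes stock JGibbLDA Inferencer and LDACmdOption behavior.
  static Model inferUnseen(String dir, String modelName, String[] unseenDocs) {
    LDACmdOption option = new LDACmdOption();
    option.inf = true;            // run in inference mode
    option.dir = dir;             // directory holding the saved model files
    option.modelName = modelName; // e.g. "model-final"
    Inferencer inferencer = new Inferencer();
    if (!inferencer.init(option)) return null; // loads the trained model and its dictionary
    return inferencer.inference(unseenDocs);   // returned Model carries the new documents' topics
  }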
  /** Init parameters for inference, reading the new dataset from a file */
  public boolean initNewModel(LDACmdOption option, Model trnModel) {
    if (!init(option)) return false;

    LDADataset dataset =
        LDADataset.readDataSet(dir + File.separator + dfile, trnModel.data.localDict);
    if (dataset == null) {
      System.out.println("Fail to read dataset!\n");
      return false;
    }

    return initNewModel(option, dataset, trnModel);
  }
  /** Load a saved model (parameters, topic assignments, and word map) */
  public boolean loadModel() {
    if (!readOthersFile(dir + File.separator + modelName + othersSuffix)) return false;

    if (!readTAssignFile(dir + File.separator + modelName + tassignSuffix)) return false;

    // read dictionary
    Dictionary dict = new Dictionary();
    if (!dict.readWordMap(dir + File.separator + wordMapFile)) return false;

    data.localDict = dict;

    return true;
  }
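loadModel stitches the model back together from three files under dir: the "modelName + othersSuffix" parameter file, the "modelName + tassignSuffix" topic assignments parsed by readTAssignFile above, and the word map read into the dictionary. The sketch below shows one way to drive it; the suffix values named in the comment are the usual JGibbLDA defaults, and the direct field assignments assume publicly accessible dir/modelName fields as in the stock Model class.

  // Sketch only: assumes the usual JGibbLDA file-name defaults (".others", ".tassign",
  // "wordmap.txt") and assignable dir/modelName fields on the Model class.
  static Model loadSavedModel(String dir, String modelName) {
    Model trnModel = new Model();
    trnModel.dir = dir;
    trnModel.modelName = modelName;
    return trnModel.loadModel() ? trnModel : null;
  }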
  /** Init parameters for estimation */
  public boolean initNewModel(LDACmdOption option) {
    if (!init(option)) return false;

    int m, n;
    p = new double[K]; // temporary buffer for the topic sampling distribution

    data = LDADataset.readDataSet(dir + File.separator + dfile);
    if (data == null) {
      System.out.println("Fail to read training data!\n");
      return false;
    }

    // + allocate memory and assign values for variables
    M = data.M;
    V = data.V;
    dir = option.dir;
    savestep = option.savestep;

    // K: from command line or default value
    // alpha, beta: from command line or default values
    // niters, savestep: from command line or default values

    // Java int arrays are zero-initialized, so no explicit clearing loops are needed
    nw = new int[V][K]; // nw[w][k]: number of times word w is assigned to topic k
    nd = new int[M][K]; // nd[m][k]: number of words in document m assigned to topic k
    nwsum = new int[K]; // nwsum[k]: total number of words assigned to topic k
    ndsum = new int[M]; // ndsum[m]: total number of words in document m

    z = new Vector[M];
    for (m = 0; m < data.M; m++) {
      int N = data.docs[m].length;
      z[m] = new Vector<Integer>();

      // initialize z with a random topic for each word
      for (n = 0; n < N; n++) {
        int topic = (int) Math.floor(Math.random() * K);
        z[m].add(topic);

        // number of instances of word assigned to topic j
        nw[data.docs[m].words[n]][topic] += 1;
        // number of words in document i assigned to topic j
        nd[m][topic] += 1;
        // total number of words assigned to topic j
        nwsum[topic] += 1;
      }
      // total number of words in document i
      ndsum[m] = N;
    }

    theta = new double[M][K];
    phi = new double[K][V];

    return true;
  }
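Once Gibbs sampling has updated these counts, the theta and phi arrays allocated at the end are normally filled with the standard smoothed estimates. The sketch below follows the stock JGibbLDA computeTheta/computePhi form; it is assumed rather than copied from this excerpt, but it is consistent with the nw[w][k] and nd[m][k] indexing used above, and it assumes alpha and beta fields as mentioned in the comments.

  // Standard collapsed-Gibbs point estimates (sketch, assuming JGibbLDA-style
  // alpha/beta fields on this class); consistent with the count arrays above.
  public void computeTheta() {
    for (int m = 0; m < M; m++) {
      for (int k = 0; k < K; k++) {
        theta[m][k] = (nd[m][k] + alpha) / (ndsum[m] + K * alpha);
      }
    }
  }

  public void computePhi() {
    for (int k = 0; k < K; k++) {
      for (int w = 0; w < V; w++) {
        phi[k][w] = (nw[w][k] + beta) / (nwsum[k] + V * beta);
      }
    }
  }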