protected boolean readTAssignFile(String tassignFile) { try { int i, j; BufferedReader reader = new BufferedReader(new InputStreamReader(new FileInputStream(tassignFile), "UTF-8")); String line; // z = new Vector[M]; // z = new ArrayList[M]; z = new short[M][]; data = new LDADataset(M); data.V = V; for (i = 0; i < M; i++) { line = reader.readLine(); StringTokenizer tknr = new StringTokenizer(line, " \t\r\n"); int length = tknr.countTokens(); // Vector<Integer> words = new Vector<Integer>(); // Vector<Integer> topics = new Vector<Integer>(); ArrayList<Integer> words = new ArrayList<>(); ArrayList<Integer> topics = new ArrayList<>(); for (j = 0; j < length; j++) { String token = tknr.nextToken(); StringTokenizer tknr2 = new StringTokenizer(token, ":"); if (tknr2.countTokens() != 2) { System.out.println("Invalid word-topic assignment line\n"); return false; } words.add(Integer.parseInt(tknr2.nextToken())); topics.add(Integer.parseInt(tknr2.nextToken())); } // end for each topic assignment // allocate and add new document to the corpus Document doc = new Document(words); data.setDoc(doc, i); // assign values for z // z[i] = new Vector<Integer>(); // z[i] = new ArrayList<>(topics.size()); // Note: Modify: init arraylist // size. z[i] = new short[topics.size()]; for (j = 0; j < topics.size(); j++) { // z[i].add(topics.get(j)); z[i][j] = (short) (int) topics.get(j); } } // end for each doc reader.close(); } catch (Exception e) { System.out.println("Error while loading model: " + e.getMessage()); e.printStackTrace(); return false; } return true; }
public Model inference(String[] strs) { // System.out.println("inference"); Model newModel = new Model(); // System.out.println("read dataset"); LDADataset dataset = LDADataset.readDataSet(strs, globalDict); return inference(dataset); }
/**
 * Initializes parameters for inference, reading the new dataset from the
 * file {@code dir/dfile} with the training model's local dictionary.
 *
 * @param option command-line options used by {@code init}
 * @param trnModel trained model supplying the dictionary
 * @return {@code true} on success; {@code false} if init or dataset loading fails
 */
public boolean initNewModel(LDACmdOption option, Model trnModel) {
    if (!init(option)) {
        return false;
    }
    String datasetPath = dir + File.separator + dfile;
    LDADataset dataset = LDADataset.readDataSet(datasetPath, trnModel.data.localDict);
    if (dataset == null) {
        System.out.println("Fail to read dataset!\n");
        return false;
    }
    return initNewModel(option, dataset, trnModel);
}
/** load saved model */ public boolean loadModel() { if (!readOthersFile(dir + File.separator + modelName + othersSuffix)) return false; if (!readTAssignFile(dir + File.separator + modelName + tassignSuffix)) return false; // read dictionary Dictionary dict = new Dictionary(); if (!dict.readWordMap(dir + File.separator + wordMapFile)) return false; data.localDict = dict; return true; }
/** Init parameters for estimation */ public boolean initNewModel(LDACmdOption option) { if (!init(option)) return false; int m, n, w, k; p = new double[K]; data = LDADataset.readDataSet(dir + File.separator + dfile); if (data == null) { System.out.println("Fail to read training data!\n"); return false; } // + allocate memory and assign values for variables M = data.M; V = data.V; dir = option.dir; savestep = option.savestep; // K: from command line or default value // alpha, beta: from command line or default values // niters, savestep: from command line or default values nw = new int[V][K]; for (w = 0; w < V; w++) { for (k = 0; k < K; k++) { nw[w][k] = 0; } } nd = new int[M][K]; for (m = 0; m < M; m++) { for (k = 0; k < K; k++) { nd[m][k] = 0; } } nwsum = new int[K]; for (k = 0; k < K; k++) { nwsum[k] = 0; } ndsum = new int[M]; for (m = 0; m < M; m++) { ndsum[m] = 0; } z = new Vector[M]; for (m = 0; m < data.M; m++) { int N = data.docs[m].length; z[m] = new Vector<Integer>(); // initilize for z for (n = 0; n < N; n++) { int topic = (int) Math.floor(Math.random() * K); z[m].add(topic); // number of instances of word assigned to topic j nw[data.docs[m].words[n]][topic] += 1; // number of words in document i assigned to topic j nd[m][topic] += 1; // total number of words assigned to topic j nwsum[topic] += 1; } // total number of words in document i ndsum[m] = N; } theta = new double[M][K]; phi = new double[K][V]; return true; }