/** * Makes a copy of a vertex neighbour list file with only vertices in given list. Reads in a file * (typically an vertex neighbour list) and outputs a copy except for any lines whose first entry * is not in given vertex list. * * <p>Each line has entries separated by whitespace. Comment lines are always copied. * * @param vertexNameToIndex map from vertex name to an index * @param fullInputFileName full filename of input file to be copied. * @param fullOutputFileName full filename of output file * @param fullOtherInputFileName full filename of second input file needed for synchronisation * @param cc if, after any white space, first word starts with this string then line is treated as * a comment line. * @param sep sepation character, e.g. tab * @param vertexColumnList list of columns to be compared * @param convertToIndex convert the vertex names to an index as given by the map provided * @param forceLowerCase force all strings to be lower case * @param infoOn true if want more info printed out */ public static int synchroniseVertexList( Map<String, Integer> vertexNameToIndex, String fullInputFileName, String fullOutputFileName, String fullOtherInputFileName, String cc, String sep, Set<Integer> vertexColumnList, boolean convertToIndex, boolean forceLowerCase, boolean infoOn) { int maxColumn = -1; for (Integer vertexColumn : vertexColumnList) { if (vertexColumn < 1) throw new RuntimeException( "first column is numbered 1, column with vertices given as " + vertexColumn); maxColumn = Math.max(vertexColumn, maxColumn); } int res = 0; // error code. boolean dontTestForCommentLine = false; if (cc.length() == 0) dontTestForCommentLine = true; // set up input file TextReader data = FileInput.openFile(fullInputFileName); if (data == null) { System.err.println( "*** synchroniseVertexList input file " + fullInputFileName + " not opened"); return -1; } if (infoOn) { System.out.print("Starting to do vertex synchronisation on columns "); for (Integer vertexColumn : vertexColumnList) System.out.print(vertexColumn + ", "); System.out.println(" of file " + fullInputFileName); if (convertToIndex) System.out.println("Converting names to indices"); } // Make a copy of vertices and note when they have been found. TreeSet<String> vertexFoundLL = new TreeSet(); TreeSet<String> vertexNotFound = new TreeSet(); // vertices in vertexNameToIndex not in file // set up output file PrintStream PS; FileOutputStream fout; try { fout = new FileOutputStream(fullOutputFileName); PS = new PrintStream(fout); } catch (FileNotFoundException e) { throw new RuntimeException( "**** synchroniseVertexList output file " + fullOutputFileName + " not opened, " + e.getMessage()); // return -2; } int linenumber = 0; int verticesInput = 0; int verticesOutput = 0; String[] vertexFound = new String[vertexColumnList.size()]; try { // Read the data from the input file. String[] numbers = new String[1000]; int column = 0; // number of columns on each line String vertex; while (data.eof() == false) { // Read until end-of-file. linenumber++; column = 0; // Read until end-of-line. if (forceLowerCase) while (data.eoln() == false) numbers[column++] = data.getWord().toLowerCase(); else while (data.eoln() == false) numbers[column++] = data.getWord(); if (maxColumn > column) throw new RuntimeException( "on line " + linenumber + " found " + column + " columns (first is numbered 1) but synchronising on column " + maxColumn); try { // next tests for first word starting for comment line string but only if this is a // nontrivial string if (dontTestForCommentLine || !numbers[0].startsWith(cc)) { boolean lineOK = true; int vc = 0; for (Integer vertexColumn : vertexColumnList) { vertex = numbers[vertexColumn - 1]; verticesInput++; if (!vertexNameToIndex.containsKey(vertex)) { lineOK = false; } vertexFound[vc++] = vertex; } if (!lineOK) continue; // the while statement // try to use vertexFoundLL.addAll; verticesOutput+=vertexColumnList.length; for (vc = 0; vc < vertexFound.length; vc++) { verticesOutput++; vertexFoundLL.add(vertexFound[vc]); } // eo for vc } // eo if (dontTestForCommentLine ... // Now write this line out to output file. // This will happen if its a comment line or if all vertices in specified columns are in // the given vertex list int columnMinusOne = column - 1; // this will be the index of the last column if (convertToIndex) for (int c = 0; c < column; c++) { PS.print( ((vertexColumnList.contains(c)) ? vertexNameToIndex.get(numbers[c]) : numbers[c]) + (c == columnMinusOne ? "" : sep)); } else { for (int c = 0; c < column; c++) PS.print(numbers[c] + (c == columnMinusOne ? "" : sep)); } PS.println(); } // eo try catch (RuntimeException e) { throw new RuntimeException( "*** PROBLEM on line " + linenumber + " of input file, " + e.getMessage()); } } // eofile for (String name : vertexNameToIndex.keySet()) if (!vertexFoundLL.contains(name)) vertexNotFound.add(name); if (infoOn) { System.out.println( "Finished vertex synchronisation on first column of file " + fullInputFileName + " producing " + fullOutputFileName); System.out.println( " " + linenumber + " input lines found " + verticesInput + " lines with vertices, wrote " + verticesOutput + " lines with vertices in given list."); System.out.println( " Given " + vertexNameToIndex.size() + " distinct input vertices, " + vertexNotFound.size() + " were not used."); System.out.println(" Output " + vertexFoundLL.size() + " distinct output vertices."); } } // eo try catch (TextReader.Error e) { // Some problem reading the data from the input file. res = -3; throw new RuntimeException( "*** File Error in " + fullInputFileName + " or " + fullOutputFileName + ", " + e.getMessage()); } finally { // Finish by closing the files, // whatever else may have happened. try { data.close(); fout.close(); } catch (IOException e) { throw new RuntimeException( "*** File Error closing " + fullInputFileName + " or " + fullOutputFileName + ", " + e.getMessage()); } } // eo finally // write out information file String infoOutputFileName = fullOutputFileName + ".info.txt"; try { fout = new FileOutputStream(infoOutputFileName); PS = new PrintStream(fout); } catch (FileNotFoundException e) { res = -2; throw new RuntimeException( "**** synchroniseVertexList information file " + infoOutputFileName + " not opened, " + e.getMessage()); } PS.println( "Vertex synchronisation on first column of file " + fullInputFileName + "against file " + fullOtherInputFileName + " producing " + fullOutputFileName); PS.println( " " + linenumber + " input lines found " + verticesInput + " lines with vertices, wrote " + verticesOutput + " lines with vertices in given list."); PS.println( " Given " + vertexNameToIndex.size() + " distinct input vertices, " + vertexNotFound.size() + " were not used."); PS.println(" Output " + vertexFoundLL.size() + " distinct output vertices."); if (vertexNotFound.size() > 0) { PS.println("Vertices in input list not found in synchronised file"); PS.println("name" + sep + "index"); for (String name : vertexNotFound) PS.println(name + sep + vertexNameToIndex.get(name)); } try { } // eo try catch (RuntimeException e) { res = -4; throw new RuntimeException( "*** File Error in information output file " + infoOutputFileName + ", " + e.getMessage()); } finally { // Finish by closing the files, // whatever else may have happened. try { fout.close(); } catch (IOException e) { throw new RuntimeException( "*** File Error closing " + infoOutputFileName + ", " + e.getMessage()); } } return res; }
/** * Read in list two columns, title then ISSN, strings separated by white space. * * <p>Use <tt>(String[]) FileInputreadStringList(fullFileName).toArray()</tt> to get array of * strings instead of an ArrayList. * * @param fullFileName name of file including directories * @param infoLevel 0 = normal, 2= debugging, -2 = silent * @return list of journals found. */ public TreeSet<Journal> readSimpleJournalData(String fullFileName, int infoLevel) { TextReader tr = ProcessScopusJournalLists.openFile(fullFileName); if (tr == null) return null; if (infoLevel > -2) System.out.println("Starting to read list of strings from " + fullFileName); ArrayList<String> words = new ArrayList(); TreeSet<Journal> journalList = new TreeSet(); String[] labelList = {JournalTitleLabel, this.ISSNLabel}; int rowNumber = 0; try { String[] column; // first find header row and identify columns needed String header; // column = line.split("\\t+"); // split at every tab int[] columnIndex = null; while (tr.eof() == false && columnIndex == null) { rowNumber++; header = tr.getln(); column = header.split("\\t+"); // split at every tab columnIndex = testLabelRow(column, labelList); } if (columnIndex == null) throw new RuntimeException("*** no header columns found in fullFileName"); if (infoLevel > -1) System.out.println("... header in row " + rowNumber); if (infoLevel > 0) for (int c = 0; c < columnIndex.length; c++) System.out.println(labelList[c] + " in column " + columnIndex[c]); // now process main data String line; String ISSN; String title; Journal journal; while (tr.eof() == false) { rowNumber++; line = tr.getln(); column = line.split("\\t+"); // split at every tab ISSN = (column.length > 1 ? column[columnIndex[1]] : Journal.SUNSET); title = (column.length > 0 ? column[columnIndex[0]] : Journal.SUNSET); journal = new Journal(title, ISSN); journalList.add(journal); if (infoLevel > 1) System.out.println(rowNumber + " j=" + title + ", n=" + ISSN); } if (infoLevel > -2) System.out.println( "Finished reading journals from file " + fullFileName + " found " + journalList.size() + " journals"); } // eo try catch (TextReader.Error e) { // Some problem reading the noNamedata from the input file. throw new RuntimeException( "*** Input Error: readJournalData failed after " + journalList.size() + " journals, row " + rowNumber + ", " + e.getMessage()); } finally { tr.close(); } return journalList; }