Exemplo n.º 1
0
  /**
   * Makes a filtered copy of a file, keeping only lines whose vertices are all in a given list.
   *
   * <p>The input file (typically a vertex neighbour list) is read word by word, words being
   * whatever {@code TextReader.getWord()} returns. A line is written to the output file when it is
   * a comment line, or when every word found in the columns listed in {@code vertexColumnList} is
   * a key of {@code vertexNameToIndex}; all other lines are dropped. The words of an output line
   * are joined with {@code sep}, so the original spacing is not preserved.
   *
   * <p>A summary, including every key of {@code vertexNameToIndex} that never appeared in an
   * accepted line, is written to {@code fullOutputFileName + ".info.txt"}.
   *
   * @param vertexNameToIndex map from vertex name to an index; its key set is the list of accepted
   *     vertices
   * @param fullInputFileName full filename of input file to be copied.
   * @param fullOutputFileName full filename of output file
   * @param fullOtherInputFileName full filename of second input file needed for synchronisation;
   *     it is only quoted in the information file, it is never opened here
   * @param cc if the first word of a line starts with this string then the line is treated as a
   *     comment line and copied without testing or converting it. An empty string switches comment
   *     detection off.
   * @param sep separation string written between the words of an output line, e.g. tab
   * @param vertexColumnList columns holding the vertices to be compared, first column is numbered 1
   * @param convertToIndex if true the vertex names in the vertex columns of data lines are written
   *     as the index given by {@code vertexNameToIndex}
   * @param forceLowerCase force all words read to lower case before comparing and writing
   * @param infoOn true if want more info printed out
   * @return 0 on success, -1 if the input file could not be opened
   * @throws RuntimeException if a column number is less than 1, a data line has fewer words than
   *     the largest vertex column, or any file cannot be opened, read, written or closed
   */
  public static int synchroniseVertexList(
      Map<String, Integer> vertexNameToIndex,
      String fullInputFileName,
      String fullOutputFileName,
      String fullOtherInputFileName,
      String cc,
      String sep,
      Set<Integer> vertexColumnList,
      boolean convertToIndex,
      boolean forceLowerCase,
      boolean infoOn) {
    int maxColumn = -1;
    for (Integer vertexColumn : vertexColumnList) {
      if (vertexColumn < 1)
        throw new RuntimeException(
            "first column is numbered 1, column with vertices given as " + vertexColumn);
      maxColumn = Math.max(vertexColumn, maxColumn);
    }
    // an empty comment string would match every line, so it means "no comment lines"
    boolean dontTestForCommentLine = (cc.length() == 0);

    // set up input file
    TextReader data = FileInput.openFile(fullInputFileName);
    if (data == null) {
      System.err.println(
          "*** synchroniseVertexList input file " + fullInputFileName + " not opened");
      return -1;
    }
    if (infoOn) {
      System.out.print("Starting to do vertex synchronisation on columns ");
      for (Integer vertexColumn : vertexColumnList) System.out.print(vertexColumn + ", ");
      System.out.println(" of file " + fullInputFileName);
      if (convertToIndex) System.out.println("Converting names to indices");
    }

    // Vertices seen in accepted lines, and vertices of vertexNameToIndex never seen in one.
    TreeSet<String> vertexFoundLL = new TreeSet<String>();
    TreeSet<String> vertexNotFound = new TreeSet<String>();

    // set up output file
    PrintStream PS;
    FileOutputStream fout;
    try {
      fout = new FileOutputStream(fullOutputFileName);
      PS = new PrintStream(fout);
    } catch (FileNotFoundException e) {
      data.close(); // do not leak the input file which is already open
      throw new RuntimeException(
          "**** synchroniseVertexList output file "
              + fullOutputFileName
              + " not opened, "
              + e.getMessage(),
          e);
    }

    int linenumber = 0;
    int verticesInput = 0;
    int verticesOutput = 0;
    String[] vertexFound = new String[vertexColumnList.size()];
    try {
      // Words of the current line; the array is grown when a line has more words than it holds.
      String[] numbers = new String[1000];
      while (data.eof() == false) { // Read until end-of-file.
        linenumber++;
        int column = 0; // number of columns on this line
        while (data.eoln() == false) { // Read until end-of-line.
          if (column == numbers.length) {
            String[] bigger = new String[2 * numbers.length];
            System.arraycopy(numbers, 0, bigger, 0, numbers.length);
            numbers = bigger;
          }
          String word = data.getWord();
          numbers[column++] = forceLowerCase ? word.toLowerCase() : word;
        }

        // A line without words is never a comment line; numbers[0] would be stale or null.
        boolean commentLine = !dontTestForCommentLine && column > 0 && numbers[0].startsWith(cc);

        // Only data lines need all the vertex columns, comment lines are always copied.
        if (!commentLine && maxColumn > column)
          throw new RuntimeException(
              "on line "
                  + linenumber
                  + " found "
                  + column
                  + " columns (first is numbered 1) but synchronising on column "
                  + maxColumn);
        try {
          if (!commentLine) {
            boolean lineOK = true;
            int vc = 0;
            for (Integer vertexColumn : vertexColumnList) {
              String vertex = numbers[vertexColumn - 1]; // columns are numbered from 1
              verticesInput++;
              if (!vertexNameToIndex.containsKey(vertex)) {
                lineOK = false;
              }
              vertexFound[vc++] = vertex;
            }
            if (!lineOK) continue; // drop this line, go on to the next one
            for (vc = 0; vc < vertexFound.length; vc++) {
              verticesOutput++;
              vertexFoundLL.add(vertexFound[vc]);
            }
          }

          // Now write this line out to output file.
          // This will happen if its a comment line or if all vertices in specified columns are in
          // the given vertex list
          int columnMinusOne = column - 1; // this will be the index of the last column
          for (int c = 0; c < column; c++) {
            // c is zero based but vertexColumnList numbers columns from 1.
            // Comment lines are copied as they are, their words need not be vertex names.
            boolean writeIndex =
                convertToIndex && !commentLine && vertexColumnList.contains(c + 1);
            if (writeIndex) PS.print(vertexNameToIndex.get(numbers[c]));
            else PS.print(numbers[c]);
            if (c != columnMinusOne) PS.print(sep);
          }
          PS.println();
        } catch (RuntimeException e) {
          throw new RuntimeException(
              "*** PROBLEM on line " + linenumber + " of input file, " + e.getMessage(), e);
        }
      } // eofile

      for (String name : vertexNameToIndex.keySet())
        if (!vertexFoundLL.contains(name)) vertexNotFound.add(name);

      if (infoOn) {
        System.out.println(
            "Finished vertex synchronisation on first column of file "
                + fullInputFileName
                + " producing "
                + fullOutputFileName);
        System.out.println(
            "    "
                + linenumber
                + " input lines found "
                + verticesInput
                + " lines with vertices, wrote "
                + verticesOutput
                + " lines with vertices in given list.");
        System.out.println(
            "      Given "
                + vertexNameToIndex.size()
                + " distinct input vertices, "
                + vertexNotFound.size()
                + " were not used.");
        System.out.println("      Output " + vertexFoundLL.size() + " distinct output vertices.");
      }
    } catch (TextReader.Error e) {
      // Some problem reading the data from the input file.
      throw new RuntimeException(
          "*** File Error in "
              + fullInputFileName
              + " or "
              + fullOutputFileName
              + ", "
              + e.getMessage(),
          e);
    } finally {
      // Finish by closing both files, whatever else may have happened. The nested finally makes
      // sure the output file is closed even if closing the input file fails.
      try {
        data.close();
      } finally {
        try {
          PS.flush();
          fout.close();
        } catch (IOException e) {
          throw new RuntimeException(
              "*** File Error closing "
                  + fullInputFileName
                  + " or "
                  + fullOutputFileName
                  + ", "
                  + e.getMessage(),
              e);
        }
      }
    }

    // write out information file
    String infoOutputFileName = fullOutputFileName + ".info.txt";
    try {
      fout = new FileOutputStream(infoOutputFileName);
      PS = new PrintStream(fout);
    } catch (FileNotFoundException e) {
      throw new RuntimeException(
          "**** synchroniseVertexList information file "
              + infoOutputFileName
              + " not opened, "
              + e.getMessage(),
          e);
    }
    try {
      PS.println(
          "Vertex synchronisation on first column of file "
              + fullInputFileName
              + " against file "
              + fullOtherInputFileName
              + " producing "
              + fullOutputFileName);
      PS.println(
          "    "
              + linenumber
              + " input lines found "
              + verticesInput
              + " lines with vertices, wrote "
              + verticesOutput
              + " lines with vertices in given list.");
      PS.println(
          "      Given "
              + vertexNameToIndex.size()
              + " distinct input vertices, "
              + vertexNotFound.size()
              + " were not used.");
      PS.println("      Output " + vertexFoundLL.size() + " distinct output vertices.");
      if (vertexNotFound.size() > 0) {
        PS.println("Vertices in input list not found in synchronised file");
        PS.println("name" + sep + "index");
        for (String name : vertexNotFound) PS.println(name + sep + vertexNameToIndex.get(name));
      }
    } catch (RuntimeException e) {
      throw new RuntimeException(
          "*** File Error in information output file "
              + infoOutputFileName
              + ", "
              + e.getMessage(),
          e);
    } finally {
      // Finish by closing the file, whatever else may have happened.
      try {
        PS.flush();
        fout.close();
      } catch (IOException e) {
        throw new RuntimeException(
            "*** File Error closing " + infoOutputFileName + ", " + e.getMessage(), e);
      }
    }

    // every failure after the input file was opened is reported by an exception
    return 0;
  }
  /**
   * Reads journal titles and ISSNs from a tab separated file which has a header row.
   *
   * <p>Lines are read until {@code testLabelRow} accepts one as the header row, that is until it
   * returns the positions of the title and ISSN columns (presumably it returns null for any other
   * line; verify against {@code testLabelRow}). Every later line is split at each run of tabs and
   * turned into one {@code Journal}. If a line is too short to hold the title or the ISSN column
   * then {@code Journal.SUNSET} is used for the missing value.
   *
   * @param fullFileName name of file including directories
   * @param infoLevel controls what is printed: above -2 start and finish messages, above -1 the
   *     row of the header, above 0 the column of each label, above 1 every journal read
   * @return set of journals found, or null if the file could not be opened.
   * @throws RuntimeException if no header row is found or if reading the file fails
   */
  public TreeSet<Journal> readSimpleJournalData(String fullFileName, int infoLevel) {
    TextReader tr = ProcessScopusJournalLists.openFile(fullFileName);
    if (tr == null) return null;
    if (infoLevel > -2) System.out.println("Starting to read list of strings from " + fullFileName);
    TreeSet<Journal> journalList = new TreeSet<Journal>();

    // labelList[0] is the title label, labelList[1] the ISSN label; columnIndex uses same order
    String[] labelList = {JournalTitleLabel, this.ISSNLabel};
    int rowNumber = 0;
    try {
      String[] column;
      // first find header row and identify columns needed
      int[] columnIndex = null;
      while (tr.eof() == false && columnIndex == null) {
        rowNumber++;
        String header = tr.getln();
        column = header.split("\\t+"); // split at every tab
        columnIndex = testLabelRow(column, labelList);
      }
      if (columnIndex == null)
        throw new RuntimeException("*** no header columns found in " + fullFileName);
      if (infoLevel > -1) System.out.println("... header in row " + rowNumber);
      if (infoLevel > 0)
        for (int c = 0; c < columnIndex.length; c++)
          System.out.println(labelList[c] + " in column " + columnIndex[c]);

      // now process main data
      int titleColumn = columnIndex[0];
      int issnColumn = columnIndex[1];
      while (tr.eof() == false) {
        rowNumber++;
        String line = tr.getln();
        column = line.split("\\t+"); // split at every tab
        // test against the column actually read, a short line need not reach that column
        String ISSN = (issnColumn < column.length ? column[issnColumn] : Journal.SUNSET);
        String title = (titleColumn < column.length ? column[titleColumn] : Journal.SUNSET);
        journalList.add(new Journal(title, ISSN));
        if (infoLevel > 1) System.out.println(rowNumber + " j=" + title + ", n=" + ISSN);
      }
      if (infoLevel > -2)
        System.out.println(
            "Finished reading journals from file "
                + fullFileName
                + " found "
                + journalList.size()
                + " journals");
    } catch (TextReader.Error e) {
      // Some problem reading the data from the input file.
      throw new RuntimeException(
          "*** Input Error: readJournalData failed after "
              + journalList.size()
              + " journals, row "
              + rowNumber
              + ", "
              + e.getMessage(),
          e);
    } finally {
      tr.close();
    }
    return journalList;
  }