Ejemplo n.º 1
0
 /**
  * Reads up to {@code len} bytes from the mapped buffer's current position into {@code bytes},
  * storing them from index {@code off} onwards.
  *
  * <p>The request is clamped to the number of bytes left before the buffer's limit, so a read
  * close to the end returns fewer bytes than requested instead of underflowing the buffer.
  *
  * @see java.io.RandomAccessFile#read(byte[], int, int)
  * @param bytes destination array
  * @param off index in {@code bytes} at which the first byte is stored
  * @param len maximum number of bytes to read
  * @return number of bytes actually read, or -1 if the buffer position is already at its limit
  *     (EOF)
  * @throws IOException declared by the signature; the buffer calls made here do not raise it
  */
 public int read(byte bytes[], int off, int len) throws IOException {
   int remaining = mappedByteBuffer.remaining();
   if (remaining == 0) {
     return -1; // EOF
   }
   // Clamp against what is left in the buffer. The destination offset is irrelevant to how many
   // bytes the source can supply, and comparing against remaining() avoids overflow in pos + len.
   if (len > remaining) {
     len = remaining; // don't read beyond EOF
   }
   mappedByteBuffer.get(bytes, off, len);
   return len;
 }
Ejemplo n.º 2
0
 /**
  * Sends the next frame of the buffer to the client, unless the client is still busy writing.
  *
  * <p>The frame ends {@code frameSize} bytes after the buffer's current position, capped at
  * {@code fileSize}. Once the frame end reaches {@code fileSize}, this object is closed.
  *
  * @throws IOException propagated from the calls made in this method
  */
 public void nextFrame() throws IOException {
   if (client.writeBusy()) {
     return;
   }

   int frameEnd = Math.min(buffer.position() + frameSize, fileSize);
   buffer.limit(frameEnd);
   client.write(buffer, false);

   boolean reachedEndOfFile = frameEnd >= fileSize;
   if (reachedEndOfFile) {
     this.close();
   }
 }
  /**
   * Maps the file in successive read-only windows and passes each window to
   * {@code parseSingleByteBuffer} until the end of the file is reached, then requests shutdown.
   *
   * <p>If {@code hasRun} is already set, the method only requests shutdown and returns. Any
   * exception raised while running is rethrown wrapped in a {@link RuntimeException}.
   */
  @Override
  public void run() {
    if (hasRun) {
      // A previous invocation already processed the file; just ask for shutdown.
      requestShutdown();
      return;
    }
    log.info("running line splitter now");
    try {
      // Upper bound for one mapping: the larger of 2^27 bytes and 2^bitsOfUntructuredLayoutRingBuffer.
      // NOTE(review): both shifts are evaluated as int before widening to long — confirm the bit
      // count never reaches 31.
      long maxStep = Math.max(1 << 27, 1 << outputRing.bitsOfUntructuredLayoutRingBuffer);
      final long fileSize = fileChannel.size();

      // File offset at which the next window starts.
      long pos = 0;

      int bytesRead;
      long startPos;
      do {
        startPos = pos;
        resetForNextByteBuffer(this);
        // System.err.println("Reading file bytes "+startPos+" to "+ (startPos+mapSize));
        // Map at most maxStep bytes, or whatever is left of the file.
        MappedByteBuffer map =
            fileChannel.map(
                FileChannel.MapMode.READ_ONLY, startPos, Math.min(maxStep, fileSize - pos));

        // Keep parsing this window until the reported count reaches the window's limit.
        do {
          bytesRead = parseSingleByteBuffer(this, map);
        } while (bytesRead < map.limit());

        // The next window starts recordStart bytes further on, not at the end of this mapping.
        // recordStart is not assigned in this method — presumably parseSingleByteBuffer leaves it
        // at the start of the last incomplete record; TODO confirm.
        pos += recordStart;
        if (showProgress) {
          System.out.println(
              " Progress:"
                  + (startPos + bytesRead)
                  + "/"
                  + fileSize
                  + "     "
                  + (((float) (startPos + bytesRead) * 100f) / (float) fileSize)
                  + "%");
        }
      } while (startPos + (long) bytesRead < fileSize);
      // Records the count reported for the final window (relative to that window's start).
      shutdownPosition = bytesRead;

      requestShutdown();
      log.trace("shutdown the line splitter");
      hasRun = true;

    } catch (Exception e) {
      throw new RuntimeException(e);
    }
  }
  /**
   * Reads up to {@code len} bytes starting at the current position {@code pos}, continuing across
   * consecutive mapped buffers as needed, and advances {@code pos} by the number of bytes read.
   *
   * <p>A zero-length request returns 0 without touching any buffer, matching the contract of
   * {@link java.io.RandomAccessFile#read(byte[], int, int)}; -1 is reserved for the case where
   * bytes were requested but none could be read.
   *
   * @see java.io.RandomAccessFile#read(byte[], int, int)
   * @param bytes destination array
   * @param off index in {@code bytes} at which the first byte is stored
   * @param len maximum number of bytes to read
   * @return number of bytes read, 0 if {@code len} is zero, or -1 if no bytes could be read (EOF)
   */
  public int read(byte bytes[], int off, int len) {
    if (len == 0) {
      // Nothing requested: report 0 rather than a false end-of-file.
      return 0;
    }

    // Index of the mapped buffer holding pos, and the offset of pos inside that buffer.
    int mapN = (int) (pos / BUFSIZE);
    int offN = (int) (pos % BUFSIZE);
    int totalRead = 0;

    while (totalRead < len) {
      if (mapN >= mappedBuffers.length) {
        // we have run out of data to read from
        break;
      }
      MappedByteBuffer currentBuffer = mappedBuffers[mapN];
      if (offN > currentBuffer.limit()) {
        // The position lies beyond the data held by this buffer.
        break;
      }
      currentBuffer.position(offN);
      int bytesFromThisBuffer = Math.min(len - totalRead, currentBuffer.remaining());
      currentBuffer.get(bytes, off, bytesFromThisBuffer);
      off += bytesFromThisBuffer;
      pos += bytesFromThisBuffer;
      totalRead += bytesFromThisBuffer;

      // Any further bytes come from the start of the next buffer.
      mapN++;
      offN = 0;
    }
    return totalRead == 0 ? -1 : totalRead;
  }
  /**
   * Command-line entry point: parses the switches, resolves every setting, reads the input
   * document and prints the processed result to standard output.
   *
   * <p>Tokens starting with {@code "-"} are treated as switches and looked up through
   * {@code CLISwitch}; any other token is taken as the document path (the last such token wins).
   * After parsing, the method resolves in order: help, verbosity, encoding, output type, language,
   * document type, document creation time, locale, configuration file, POS tagger and interval
   * tagging. It then memory-maps the document, decodes it with the chosen encoding, runs
   * {@code HeidelTimeStandalone.process} and prints the result as UTF-8.
   *
   * <p>Invalid or missing settings terminate the JVM via {@code System.exit(-1)}; the help switch
   * terminates it with status 0.
   *
   * @param args command-line tokens: switches, their values, and the document path
   */
  public static void main(String[] args) {
    String docPath = null;
    for (int i = 0; i < args.length; i++) { // iterate over cli parameter tokens
      if (args[i].startsWith("-")) { // assume we found a switch
        // get the relevant enum
        CLISwitch sw = CLISwitch.getEnumFromSwitch(args[i]);
        if (sw == null) { // unsupported CLI switch
          logger.log(Level.WARNING, "Unsupported switch: " + args[i] + ". Quitting.");
          System.exit(-1);
        }

        if (sw.getHasFollowingValue()) { // handle values for switches
          if (args.length > i + 1
              && !args[i + 1].startsWith(
                  "-")) { // we still have an array index after this one and it's not a switch
            // Consume the following token as this switch's value.
            sw.setValue(args[++i]);
          } else { // value is missing or malformed
            logger.log(
                Level.WARNING, "Invalid or missing parameter after " + args[i] + ". Quitting.");
            System.exit(-1);
          }
        } else { // activate the value-less switches
          sw.setValue(null);
        }
      } else { // assume we found the document's path/name
        docPath = args[i];
      }
    }

    // display help dialog if HELP-switch is given
    if (CLISwitch.HELP.getIsActive()) {
      printHelp();
      System.exit(0);
    }

    // start off with the verbosity recognition -- lots of the other
    // stuff can be skipped if this is set too high
    if (CLISwitch.VERBOSITY2.getIsActive()) {
      logger.setLevel(Level.ALL);
      logger.log(Level.INFO, "Verbosity: '-vv'; Logging level set to ALL.");

      // output the found language resource folders
      String languagesList = "";
      for (String language : ResourceScanner.getInstance().getDetectedResourceFolders()) {
        languagesList += System.getProperty("line.separator") + "- " + language;
      }
      logger.log(Level.INFO, "Listing detected language folders:" + languagesList);
    } else if (CLISwitch.VERBOSITY.getIsActive()) {
      logger.setLevel(Level.INFO);
      logger.log(Level.INFO, "Verbosity: '-v'; Logging level set to INFO and above.");
    } else {
      logger.setLevel(Level.WARNING);
      logger.log(
          Level.INFO,
          "Verbosity -v/-vv NOT FOUND OR RECOGNIZED; Logging level set to WARNING and above.");
    }

    // Check input encoding
    // Both branches read the value from the switch; only the log message differs.
    // NOTE(review): the fallback message names UTF-8 — presumably the switch's default value;
    // verify against CLISwitch.
    String encodingType = null;
    if (CLISwitch.ENCODING.getIsActive()) {
      encodingType = CLISwitch.ENCODING.getValue().toString();
      logger.log(Level.INFO, "Encoding '-e': " + encodingType);
    } else {
      // Encoding type not found
      encodingType = CLISwitch.ENCODING.getValue().toString();
      logger.log(Level.INFO, "Encoding '-e': NOT FOUND OR RECOGNIZED; set to 'UTF-8'");
    }

    // Check output format
    OutputType outputType = null;
    if (CLISwitch.OUTPUTTYPE.getIsActive()) {
      outputType = OutputType.valueOf(CLISwitch.OUTPUTTYPE.getValue().toString().toUpperCase());
      logger.log(Level.INFO, "Output '-o': " + outputType.toString().toUpperCase());
    } else {
      // Output type not found
      outputType = (OutputType) CLISwitch.OUTPUTTYPE.getValue();
      logger.log(
          Level.INFO,
          "Output '-o': NOT FOUND OR RECOGNIZED; set to " + outputType.toString().toUpperCase());
    }

    // Check language
    Language language = null;
    if (CLISwitch.LANGUAGE.getIsActive()) {
      language = Language.getLanguageFromString((String) CLISwitch.LANGUAGE.getValue());

      // Abort only when the language resolved to WILDCARD and no detected resource folder
      // carries its name.
      if (language == Language.WILDCARD
          && !ResourceScanner.getInstance()
              .getDetectedResourceFolders()
              .contains(language.getName())) {
        logger.log(
            Level.SEVERE,
            "Language '-l': " + CLISwitch.LANGUAGE.getValue() + " NOT RECOGNIZED; aborting.");
        printHelp();
        System.exit(-1);
      } else {
        logger.log(Level.INFO, "Language '-l': " + language.getName());
      }
    } else {
      // Language not found
      language = Language.getLanguageFromString((String) CLISwitch.LANGUAGE.getValue());
      logger.log(
          Level.INFO, "Language '-l': NOT FOUND; set to " + language.toString().toUpperCase());
    }

    // Check type
    DocumentType type = null;
    if (CLISwitch.DOCTYPE.getIsActive()) {
      try {
        if (CLISwitch.DOCTYPE
            .getValue()
            .equals("narrative")) { // redirect "narrative" to "narratives"
          CLISwitch.DOCTYPE.setValue("narratives");
        }
        // valueOf throws IllegalArgumentException for names that are not DocumentType constants.
        type = DocumentType.valueOf(CLISwitch.DOCTYPE.getValue().toString().toUpperCase());
      } catch (IllegalArgumentException e) {
        logger.log(
            Level.WARNING,
            "Type '-t': NOT RECOGNIZED. These are the available options: "
                + Arrays.asList(DocumentType.values()));
        System.exit(-1);
      }
      logger.log(Level.INFO, "Type '-t': " + type.toString().toUpperCase());
    } else {
      // Type not found
      type = (DocumentType) CLISwitch.DOCTYPE.getValue();
      logger.log(Level.INFO, "Type '-t': NOT FOUND; set to " + type.toString().toUpperCase());
    }

    // Check document creation time
    Date dct = null;
    if (CLISwitch.DCT.getIsActive()) {
      try {
        // The creation time must be given as yyyy-MM-dd.
        DateFormat formatter = new SimpleDateFormat("yyyy-MM-dd");
        dct = formatter.parse(CLISwitch.DCT.getValue().toString());
        logger.log(Level.INFO, "Document Creation Time '-dct': " + dct.toString());
      } catch (Exception e) {
        // DCT was not parseable
        logger.log(Level.WARNING, "Document Creation Time '-dct': NOT RECOGNIZED. Quitting.");
        printHelp();
        System.exit(-1);
      }
    } else {
      // Without an explicit creation time, only NEWS and COLLOQUIAL documents get one (taken
      // from the switch's value); for other types dct stays null.
      if ((type == DocumentType.NEWS) || (type == DocumentType.COLLOQUIAL)) {
        // Dct needed
        dct = (Date) CLISwitch.DCT.getValue();
        logger.log(
            Level.INFO,
            "Document Creation Time '-dct': NOT FOUND; set to local date ("
                + dct.toString()
                + ").");
      } else {
        logger.log(Level.INFO, "Document Creation Time '-dct': NOT FOUND; skipping.");
      }
    }

    // Handle locale switch
    String locale = (String) CLISwitch.LOCALE.getValue();
    Locale myLocale = null;
    if (CLISwitch.LOCALE.getIsActive()) {
      // check if the requested locale is available
      // (case-insensitive match on the locale's string form; the last match wins)
      for (Locale l : Locale.getAvailableLocales()) {
        if (l.toString().toLowerCase().equals(locale.toLowerCase())) myLocale = l;
      }

      try {
        // If no locale matched, myLocale is still null here and setDefault throws, which routes
        // into the catch block below.
        Locale.setDefault(myLocale); // try to set the locale
        logger.log(Level.INFO, "Locale '-locale': " + myLocale.toString());
      } catch (Exception e) { // if the above fails, spit out error message and available locales
        logger.log(
            Level.WARNING,
            "Supplied locale parameter couldn't be resolved to a working locale. Try one of these:");
        logger.log(
            Level.WARNING,
            Arrays.asList(Locale.getAvailableLocales()).toString()); // list available locales
        printHelp();
        System.exit(-1);
      }
    } else {
      // no -locale parameter supplied: just show default locale
      logger.log(
          Level.INFO,
          "Locale '-locale': NOT FOUND, set to environment locale: "
              + Locale.getDefault().toString());
    }

    // Read configuration from file
    String configPath = CLISwitch.CONFIGFILE.getValue().toString();
    try {
      logger.log(Level.INFO, "Configuration path '-c': " + configPath);

      readConfigFile(configPath);

      logger.log(Level.FINE, "Config initialized");
    } catch (Exception e) {
      e.printStackTrace();
      logger.log(
          Level.WARNING,
          "Config could not be initialized! Please supply the -c switch or "
              + "put a config.props into this directory.");
      printHelp();
      System.exit(-1);
    }

    // Set the preprocessing POS tagger
    POSTagger posTagger = null;
    if (CLISwitch.POSTAGGER.getIsActive()) {
      try {
        posTagger = POSTagger.valueOf(CLISwitch.POSTAGGER.getValue().toString().toUpperCase());
      } catch (IllegalArgumentException e) {
        logger.log(
            Level.WARNING,
            "Given POS Tagger doesn't exist. Please specify a valid one as listed in the help.");
        printHelp();
        System.exit(-1);
      }
      logger.log(Level.INFO, "POS Tagger '-pos': " + posTagger.toString().toUpperCase());
    } else {
      // Type not found
      posTagger = (POSTagger) CLISwitch.POSTAGGER.getValue();
      logger.log(
          Level.INFO,
          "POS Tagger '-pos': NOT FOUND OR RECOGNIZED; set to "
              + posTagger.toString().toUpperCase());
    }

    // Set whether or not to use the Interval Tagger
    // Interval tagging is enabled exactly when the INTERVALS switch is active.
    Boolean doIntervalTagging = false;
    if (CLISwitch.INTERVALS.getIsActive()) {
      doIntervalTagging = CLISwitch.INTERVALS.getIsActive();
      logger.log(Level.INFO, "Interval Tagger '-it': " + doIntervalTagging.toString());
    } else {
      logger.log(
          Level.INFO,
          "Interval Tagger '-it': NOT FOUND OR RECOGNIZED; set to " + doIntervalTagging.toString());
    }

    // make sure we have a document path
    if (docPath == null) {
      logger.log(Level.WARNING, "No input file given; aborting.");
      printHelp();
      System.exit(-1);
    }

    // Run HeidelTime
    RandomAccessFile aFile = null;
    MappedByteBuffer buffer = null;
    FileChannel inChannel = null;
    PrintWriter pwOut = null;
    try {
      logger.log(Level.INFO, "Reading document using charset: " + encodingType);

      // Memory-map the whole document read-only and copy it into a byte array.
      // NOTE(review): the size is narrowed to int for the array, so this path assumes the
      // document is smaller than 2 GiB — confirm that is acceptable.
      aFile = new RandomAccessFile(docPath, "r");
      inChannel = aFile.getChannel();
      buffer = inChannel.map(FileChannel.MapMode.READ_ONLY, 0, inChannel.size());
      buffer.load();
      byte[] inArr = new byte[(int) inChannel.size()];

      for (int i = 0; i < buffer.limit(); i++) {
        inArr[i] = buffer.get();
      }

      // double-newstring should not be necessary, but without this, it's not running on Windows (?)
      // Decode with the chosen charset, then round-trip the text through UTF-8 bytes.
      String input = new String(new String(inArr, encodingType).getBytes("UTF-8"), "UTF-8");

      HeidelTimeStandalone standalone =
          new HeidelTimeStandalone(language, type, outputType, null, posTagger, doIntervalTagging);
      String out = standalone.process(input, dct);

      // Print output always as UTF-8
      pwOut = new PrintWriter(new OutputStreamWriter(System.out, "UTF-8"));
      pwOut.println(out);
    } catch (Exception e) {
      // Failures while reading or processing are only printed; no System.exit is issued here.
      e.printStackTrace();
    } finally {
      if (pwOut != null) {
        pwOut.close();
      }
      if (buffer != null) {
        // clear() only resets the buffer's position and limit.
        buffer.clear();
      }
      if (inChannel != null) {
        try {
          inChannel.close();
        } catch (IOException e) {
          // a failure to close the channel is ignored
        }
      }
      if (aFile != null) {
        try {
          aFile.close();
        } catch (IOException e) {
          // a failure to close the file is ignored
        }
      }
    }
  }
Ejemplo n.º 6
0
 /**
  * Reports the length of the mapped data, taken from the mapped buffer's limit.
  *
  * @see java.io.RandomAccessFile#length()
  * @return the mapped buffer's limit, widened to {@code long}
  * @throws IOException declared by the signature; reading the limit does not raise it
  */
 public long length() throws IOException {
   final long mappedLength = mappedByteBuffer.limit();
   return mappedLength;
 }
  /**
   * Experimental parser built to leverage multiple cores and keep up with the speed of modern SSDs.
   *
   * <p>Maps the file read-only in blocks of up to 2^25 bytes, scans each block byte by byte, and
   * writes the buffer position following every byte whose value is below 48 into the ring buffer
   * {@code rb}, publishing the writes after every 16 such tokens. Progress is printed to standard
   * output after each block.
   *
   * @param fileChannel channel of the file to scan
   * @throws IOException if querying the size or mapping the file fails
   */
  public void extract(FileChannel fileChannel) throws IOException {
    MappedByteBuffer mappedBuffer;

    long fileSize = fileChannel.size();
    long position = 0;
    int tailPadding = 8; // needed to cover the transition
    long blockSize = 1 << 25;

    // NOTE(review): typeExtractor and rfe are constructed but not otherwise used in this method.
    TypeExtractor typeExtractor = new TypeExtractor(true /* force ASCII */);
    RecordFieldExtractor rfe = new RecordFieldExtractor();

    mappedBuffer =
        fileChannel.map(
            FileChannel.MapMode.READ_ONLY, position, Math.min(blockSize, fileSize - position));
    int padding = tailPadding;
    do {

      // When this mapping reaches the end of the file there is no following block, so the
      // trailing padding bytes are scanned too.
      if (mappedBuffer.limit() + position == fileSize) {
        padding = 0;
      }

      int pos = 0;

      // Record offset 0 as the first entry for this block.
      Pipe.setValue(
          rb.structuredLayoutRingBuffer,
          rb.mask,
          Pipe.getWorkingHeadPositionObject(rb).value++,
          pos);

      int tokenCount = 0;
      int c = 0; // number of bytes consumed from this block

      // Number of bytes to scan in this block, leaving `padding` bytes at the tail unread.
      // NOTE(review): the do/while below always runs its body once, so a mapping with no
      // remaining bytes would make get() throw — confirm callers never hit that case.
      int j = mappedBuffer.remaining() - padding;
      do {
        // walk over the data while we have this section mapped.
        c++;

        byte b = (byte) mappedBuffer.get();

        //      RecordFieldExtractor.appendContent(rfe, b); //TOO much work here must do on reading
        // thread.

        // TODO: check the field type sums
        // TODO: zero copy but we need to discover tokens

        // splits on returns, commas, dots and many other punctuation
        // (any byte value below 48, i.e. below ASCII '0'; negative byte values also match)
        if (b < 48) {
          // System.err.println("char :"+b);

          // what mask can be built to combine the byte we are after.

          // allTheBits++; //do something

          // Record the position just past the delimiter byte.
          pos = mappedBuffer.position();
          Pipe.setValue(
              rb.structuredLayoutRingBuffer,
              rb.mask,
              Pipe.getWorkingHeadPositionObject(rb).value++,
              pos);

          // Publish in batches: once every 16 recorded tokens.
          if ((++tokenCount & 0xF) == 0) {
            Pipe.publishWrites(rb);
          }

          // rb.reset();

        }

      } while (--j > 0);

      // this tokenizer assumes that the file ends with a field delimiter so the last record gets
      // flushed.

      // TODO: need to wait for threads to finish before swapping to new page or have multiple pages
      // to swap in/out

      // only increment by exactly how many bytes were read assuming we started at zero
      // can only cut at the last known record start
      position += c;

      System.out.println("bytes read so far:" + position);

      // Map the next block starting right after the bytes consumed so far.
      mappedBuffer =
          fileChannel.map(
              FileChannel.MapMode.READ_ONLY, position, Math.min(blockSize, fileSize - position));

    } while (position < fileSize);
  }
Ejemplo n.º 8
0
 /**
  * Reports the length of the mapped data, taken from the mapped buffer's limit.
  *
  * @see java.io.RandomAccessFile#length()
  * @return the mapped buffer's limit, widened to {@code long}
  */
 public long length() {
   final long mappedLength = mappedByteBuffer.limit();
   return mappedLength;
 }