Java MemoryControl.request Examples

Programming Language: Java

Namespace/Package Name: net.yacy.kelondro.util

Class/Type: MemoryControl

Method/Function: request

Examples at hotexamples.com: 2

Java MemoryControl.request - 2 examples found. These are the top-rated real-world Java examples of net.yacy.kelondro.util.MemoryControl.request, extracted from open source projects. You can rate examples to help us improve their quality.

Frequently Used Methods

Show Hide

available(2)

request(2)

getDHTallowed(1)

setDHTMbyte(1)

shortStatus(1)

Example #1

Show file

File: pdfParser.java Project: supertanglang/yacy_search_server

  /**
   * Parses a PDF read from {@code source} into one or more {@link Document}s.
   *
   * <p>The method first asks {@link MemoryControl} for 200 MiB, parses the stream with PDFBox,
   * tries to open encrypted documents with an empty password, reads the document metadata and
   * finally extracts links and text. When {@code individualPages} is set, one document per PDF
   * page is produced (addressed by a virtual URL with an extra page parameter); otherwise a single
   * document holding the combined text and links is produced.
   *
   * @param location URL of the PDF; used for error reporting, as title fallback and as base of
   *     the per-page virtual URLs
   * @param mimeType mime type that is passed on to the created documents
   * @param charset not used by this implementation (the documents are always created as UTF-8)
   * @param scraper not used by this implementation
   * @param timezoneOffset not used by this implementation
   * @param source stream to read the PDF from
   * @return the created documents, or {@code null} when link or text extraction threw (that
   *     failure is deliberately swallowed, see the catch block near the end)
   * @throws Parser.Failure if not enough memory is available, the PDF cannot be read, or it is
   *     encrypted and cannot be opened for content extraction
   * @throws InterruptedException declared in the signature; NOTE(review): an interrupt raised by
   *     {@code Thread.join} below is currently caught by the catch-all and not propagated
   */
  @Override
  public Document[] parse(
      final AnchorURL location,
      final String mimeType,
      final String charset,
      final VocabularyScraper scraper,
      final int timezoneOffset,
      final InputStream source)
      throws Parser.Failure, InterruptedException {

    // check memory for parser
    if (!MemoryControl.request(200 * 1024 * 1024, false)) {
      throw new Parser.Failure(
          "Not enough Memory available for pdf parser: " + MemoryControl.available(), location);
    }

    // create a pdf parser; run it with minimum priority and restore the caller's priority
    // afterwards (instead of unconditionally forcing NORM_PRIORITY)
    PDDocument pdfDoc;
    final Thread parsingThread = Thread.currentThread();
    final int previousPriority = parsingThread.getPriority();
    try {
      parsingThread.setPriority(Thread.MIN_PRIORITY); // the pdfparser is a big pain
      final PDFParser pdfParser = new PDFParser(source);
      pdfParser.setTempDirectory(new File(System.getProperty("java.io.tmpdir")));
      pdfParser.parse();
      pdfDoc = pdfParser.getPDDocument();
    } catch (final IOException e) {
      throw new Parser.Failure(e.getMessage(), location);
    } finally {
      parsingThread.setPriority(previousPriority);
    }

    if (pdfDoc.isEncrypted()) {
      // try to open the document with an empty password; the numbers in the messages tell
      // apart which kind of exception made the attempt fail
      try {
        pdfDoc.openProtection(new StandardDecryptionMaterial(""));
      } catch (final BadSecurityHandlerException e) {
        closePdfQuietly(pdfDoc);
        throw new Parser.Failure("Document is encrypted (1): " + e.getMessage(), location);
      } catch (final IOException e) {
        closePdfQuietly(pdfDoc);
        throw new Parser.Failure("Document is encrypted (2): " + e.getMessage(), location);
      } catch (final CryptographyException e) {
        closePdfQuietly(pdfDoc);
        throw new Parser.Failure("Document is encrypted (3): " + e.getMessage(), location);
      }
      final AccessPermission perm = pdfDoc.getCurrentAccessPermission();
      if (perm == null || !perm.canExtractContent()) {
        closePdfQuietly(pdfDoc);
        throw new Parser.Failure("Document is encrypted and cannot be decrypted", location);
      }
    }

    // extracting some metadata
    PDDocumentInformation info = pdfDoc.getDocumentInformation();
    String docTitle = null,
        docSubject = null,
        docAuthor = null,
        docPublisher = null,
        docKeywordStr = null;
    Date docDate = new Date();
    if (info != null) {
      docTitle = info.getTitle();
      docSubject = info.getSubject();
      docAuthor = info.getAuthor();
      docPublisher = info.getProducer();
      if (docPublisher == null || docPublisher.isEmpty()) docPublisher = info.getCreator();
      docKeywordStr = info.getKeywords();
      try {
        if (info.getModificationDate() != null) docDate = info.getModificationDate().getTime();
      } catch (final IOException ignored) {
        // unreadable modification date: keep the current time assigned above
      }
      // unused:
      // info.getTrapped());
    }
    info = null;

    // title fallbacks: first the file name of the URL, then the document subject
    if (docTitle == null || docTitle.isEmpty()) {
      docTitle = MultiProtocolURL.unescape(location.getFileName());
    }
    // the subject is also used when the file name fallback produced an empty title; an existing
    // title is never replaced by an empty subject
    if (docTitle == null
        || (docTitle.isEmpty() && docSubject != null && !docSubject.isEmpty())) {
      docTitle = docSubject;
    }
    String[] docKeywords = null;
    if (docKeywordStr != null) {
      docKeywords = docKeywordStr.split(" |,");
    }

    Collection<AnchorURL>[] pdflinks = null;
    Document[] result = null;
    try {
      // get the links
      pdflinks = extractPdfLinks(pdfDoc);

      // get the fulltext (either per document or for each page)
      final PDFTextStripper stripper = new PDFTextStripper(StandardCharsets.UTF_8.name());

      if (individualPages) {
        // this is a hack which stores individual pages of the source pdf into individual index
        // documents
        // the new documents will get a virtual link with a post argument page=X appended to the
        // original url

        // collect text
        int pagecount = pdfDoc.getNumberOfPages();
        String[] pages = new String[pagecount];
        for (int page = 1; page <= pagecount; page++) {
          stripper.setStartPage(page);
          stripper.setEndPage(page);
          pages[page - 1] = stripper.getText(pdfDoc);
        }

        // create individual documents for each page
        assert pages.length == pdflinks.length
            : "pages.length = " + pages.length + ", pdflinks.length = " + pdflinks.length;
        result = new Document[Math.min(pages.length, pdflinks.length)];
        String loc = location.toNormalform(true);
        for (int page = 0; page < result.length; page++) {
          // these are virtual new pages; we cannot combine them with '#' as that would be
          // removed when computing the urlhash
          final AnchorURL pageUrl =
              new AnchorURL(
                  loc
                      + (loc.indexOf('?') > 0 ? '&' : '?')
                      + individualPagePropertyname
                      + '='
                      + (page + 1));
          result[page] =
              new Document(
                  pageUrl,
                  mimeType,
                  StandardCharsets.UTF_8.name(),
                  this,
                  null,
                  docKeywords,
                  singleList(docTitle),
                  docAuthor,
                  docPublisher,
                  null,
                  null,
                  0.0f,
                  0.0f,
                  // '>=' (not '>'): index pages.length is already out of bounds
                  pages == null || page >= pages.length ? new byte[0] : UTF8.getBytes(pages[page]),
                  pdflinks == null || page >= pdflinks.length ? null : pdflinks[page],
                  null,
                  null,
                  false,
                  docDate);
        }
      } else {
        // collect the whole text at once
        final CharBuffer writer = new CharBuffer(odtParser.MAX_DOCSIZE);
        byte[] contentBytes = new byte[0];
        stripper.setEndPage(3); // get first 3 pages (always)
        writer.append(stripper.getText(pdfDoc));
        contentBytes = writer.getBytes(); // remember text in case of interrupting thread

        if (pdfDoc.getNumberOfPages() > 3) { // spare creating/starting thread if all pages read
          stripper.setStartPage(4); // continue with page 4 (terminated, resulting in no text)
          stripper.setEndPage(Integer.MAX_VALUE); // set to default
          // we start the pdf parsing in a separate thread to ensure that it can be terminated
          final PDDocument pdfDocC = pdfDoc;
          final Thread t =
              new Thread() {
                @Override
                public void run() {
                  Thread.currentThread().setName("pdfParser.getText:" + location);
                  try {
                    writer.append(stripper.getText(pdfDocC));
                  } catch (final Throwable ignored) {
                    // best effort: the text of the first three pages is already in the writer
                  }
                }
              };
          t.start();
          t.join(3000); // pdfbox likes to forget to terminate ... (quite often)
          if (t.isAlive()) t.interrupt();
        }
        contentBytes = writer.getBytes(); // get final text before closing writer

        Collection<AnchorURL> pdflinksCombined = new HashSet<AnchorURL>();
        for (Collection<AnchorURL> pdflinksx : pdflinks)
          if (pdflinksx != null) pdflinksCombined.addAll(pdflinksx);
        result =
            new Document[] {
              new Document(
                  location,
                  mimeType,
                  StandardCharsets.UTF_8.name(),
                  this,
                  null,
                  docKeywords,
                  singleList(docTitle),
                  docAuthor,
                  docPublisher,
                  null,
                  null,
                  0.0f,
                  0.0f,
                  contentBytes,
                  pdflinksCombined,
                  null,
                  null,
                  false,
                  docDate)
            };
      }
    } catch (final Throwable e) {
      // deliberately swallowed: the document is closed in finally and the method returns
      // whatever is in 'result' at this point (null if no document was built)
      // throw new Parser.Failure(e.getMessage(), location);
    } finally {
      try {
        pdfDoc.close();
      } catch (final Throwable ignored) {
        // nothing more can be done if closing fails
      }
    }

    // clear resources in pdfbox. they say that is resolved but it's not. see:
    // https://issues.apache.org/jira/browse/PDFBOX-313
    // https://issues.apache.org/jira/browse/PDFBOX-351
    // https://issues.apache.org/jira/browse/PDFBOX-441
    // pdfbox still generates an enormous number of object allocations and does not delete them
    // the following objects are statically stored and never flushed:
    // COSFloat, COSArray, COSInteger, COSObjectKey, COSObject, COSDictionary,
    // COSStream, COSString, COSName, COSDocument, COSInteger[], COSNull
    // the great number of these objects can easily be seen in Java Visual VM
    // we try to get them out of memory here by forced clear calls and hope for the best
    pdfDoc = null;
    clean_up_idiotic_PDFParser_font_cache_which_eats_up_tons_of_megabytes();

    return result;
  }

  /**
   * Closes {@code doc} and ignores an {@link IOException} raised by the close; used on paths that
   * are about to throw a more specific {@link Parser.Failure}.
   */
  private static void closePdfQuietly(final PDDocument doc) {
    try {
      doc.close();
    } catch (final IOException ignored) {
      // best effort: the caller reports the original problem instead
    }
  }

Example #2

Show file

File: IndexCell.java Project: sebastianscatularo/yacy

    /**
     * Dumps the RAM cache into a new BLOB file when a dump is due and afterwards shrinks the BLOB
     * array when a clean-up is due.
     *
     * <p>Both conditions are evaluated once without a lock and then again while holding the
     * respective monitor ({@code merger} for the dump, {@code array} for the clean-up), so the
     * work is skipped if the state changed while waiting for the monitor. Every exception raised
     * by the dump or the clean-up is logged and not propagated.
     */
    private void flushBuffer() {

      // dump the cache if necessary
      final long t = System.currentTimeMillis();
      if (isDumpDue(t)) {
        synchronized (IndexCell.this.merger) {
          if (isDumpDue(t)) {
            try {
              IndexCell.this.lastDump = System.currentTimeMillis();
              // removed delayed
              try {
                removeDelayed();
              } catch (final IOException e) {
                // a failing delayed removal must not prevent the dump, but it should be visible
                ConcurrentLog.logException(e);
              }
              // dump the ram
              final File dumpFile = IndexCell.this.array.newContainerBLOBFile();
              // a critical point: when the ram is handed to the dump job,
              // don't write into it any more. Use a fresh one instead
              ReferenceContainerCache<ReferenceType> ramdump;
              final ByteOrder termOrder = IndexCell.this.ram.termKeyOrdering();
              final int termSize = IndexCell.this.ram.termKeyLength();
              // NOTE(review): 'this' is the object declaring flushBuffer, which may differ from
              // IndexCell.this - confirm that writers to 'ram' use the same monitor
              synchronized (this) {
                ramdump = IndexCell.this.ram;
                // get a fresh ram cache
                IndexCell.this.ram =
                    new ReferenceContainerCache<ReferenceType>(
                        IndexCell.this.factory, termOrder, termSize);
              }
              // dump the buffer
              IndexCell.this.merger.dump(ramdump, dumpFile, IndexCell.this.array);
              IndexCell.this.lastDump = System.currentTimeMillis();
            } catch (final Throwable e) {
              // catch all exceptions
              ConcurrentLog.logException(e);
            }
          }
        }
      }

      // clean-up the cache
      if ((IndexCell.this.array.entries() > 50 || IndexCell.this.lastCleanup + cleanupCycle < t)) {
        synchronized (IndexCell.this.array) {
          if (IndexCell.this.array.entries() > 50
              || (IndexCell.this.lastCleanup + cleanupCycle < System.currentTimeMillis())) {
            try {
              // set time to prevent that this is called too soon again
              IndexCell.this.lastCleanup = System.currentTimeMillis();
              IndexCell.this.shrink(IndexCell.this.targetFileSize, IndexCell.this.maxFileSize);
              // set again to mark end of procedure
              IndexCell.this.lastCleanup = System.currentTimeMillis();
            } catch (final Throwable e) {
              // catch all exceptions
              ConcurrentLog.logException(e);
            }
          }
        }
      }
    }

    /**
     * Tells whether the RAM cache should be dumped: it reached {@code maxRamEntries}, or it holds
     * more than 3000 entries and a request for 80 MiB of memory is refused, or it is non-empty
     * and the last dump is more than {@code dumpCycle} before {@code now}.
     *
     * @param now reference time in milliseconds that the last dump time is compared against
     * @return {@code true} if one of the dump conditions holds
     */
    private boolean isDumpDue(final long now) {
      return IndexCell.this.ram.size() >= IndexCell.this.maxRamEntries
          || (IndexCell.this.ram.size() > 3000
              && !MemoryControl.request(80L * 1024L * 1024L, false))
          || (!IndexCell.this.ram.isEmpty() && IndexCell.this.lastDump + dumpCycle < now);
    }