@Override
  public Document[] parse(
      final AnchorURL location,
      final String mimeType,
      final String charset,
      final VocabularyScraper scraper,
      final int timezoneOffset,
      final InputStream source)
      throws Parser.Failure, InterruptedException {

    // Refuse to start when memory is low: pdfbox is known to allocate heavily
    // and an OOM mid-parse is worse than rejecting the document up front.
    if (!MemoryControl.request(200 * 1024 * 1024, false))
      throw new Parser.Failure(
          "Not enough Memory available for pdf parser: " + MemoryControl.available(), location);

    // Parse the raw stream into a PDDocument. Parsing runs at minimum thread
    // priority because the pdfbox parser is CPU-heavy; priority is always
    // restored in the finally block.
    PDDocument pdfDoc;
    try {
      Thread.currentThread().setPriority(Thread.MIN_PRIORITY); // the pdfparser is a big pain
      final PDFParser pdfParser = new PDFParser(source);
      pdfParser.setTempDirectory(new File(System.getProperty("java.io.tmpdir")));
      pdfParser.parse();
      pdfDoc = pdfParser.getPDDocument();
    } catch (final IOException e) {
      throw new Parser.Failure(e.getMessage(), location);
    } finally {
      Thread.currentThread().setPriority(Thread.NORM_PRIORITY);
    }

    // Encrypted documents: try the empty owner password; if decryption fails
    // or content extraction is not permitted, release the document and fail.
    if (pdfDoc.isEncrypted()) {
      try {
        pdfDoc.openProtection(new StandardDecryptionMaterial(""));
      } catch (final BadSecurityHandlerException e) {
        closeDocQuietly(pdfDoc);
        throw new Parser.Failure("Document is encrypted (1): " + e.getMessage(), location);
      } catch (final IOException e) {
        closeDocQuietly(pdfDoc);
        throw new Parser.Failure("Document is encrypted (2): " + e.getMessage(), location);
      } catch (final CryptographyException e) {
        closeDocQuietly(pdfDoc);
        throw new Parser.Failure("Document is encrypted (3): " + e.getMessage(), location);
      }
      final AccessPermission perm = pdfDoc.getCurrentAccessPermission();
      if (perm == null || !perm.canExtractContent()) {
        closeDocQuietly(pdfDoc);
        throw new Parser.Failure("Document is encrypted and cannot be decrypted", location);
      }
    }

    // Extract basic metadata (title, subject, author, publisher, keywords,
    // modification date) when the info dictionary is present.
    PDDocumentInformation info = pdfDoc.getDocumentInformation();
    String docTitle = null,
        docSubject = null,
        docAuthor = null,
        docPublisher = null,
        docKeywordStr = null;
    Date docDate = new Date(); // fallback: "now" when the pdf carries no usable date
    if (info != null) {
      docTitle = info.getTitle();
      docSubject = info.getSubject();
      docAuthor = info.getAuthor();
      docPublisher = info.getProducer();
      if (docPublisher == null || docPublisher.isEmpty()) docPublisher = info.getCreator();
      docKeywordStr = info.getKeywords();
      try {
        if (info.getModificationDate() != null) docDate = info.getModificationDate().getTime();
      } catch (final IOException ignored) {
        // malformed date entry in the info dictionary: keep the default date
      }
      // unused:
      // info.getTrapped());
    }
    info = null; // drop the reference so the metadata object can be collected early

    // Derive a title from the file name, then from the subject, if unset.
    if (docTitle == null || docTitle.isEmpty()) {
      docTitle = MultiProtocolURL.unescape(location.getFileName());
    }
    if (docTitle == null) {
      docTitle = docSubject;
    }
    String[] docKeywords = null;
    if (docKeywordStr != null) {
      docKeywords = docKeywordStr.split(" |,"); // keywords are space- or comma-separated
    }

    Collection<AnchorURL>[] pdflinks = null;
    Document[] result = null;
    try {
      // get the per-page link annotations
      pdflinks = extractPdfLinks(pdfDoc);

      // get the fulltext (either per document or for each page)
      final PDFTextStripper stripper = new PDFTextStripper(StandardCharsets.UTF_8.name());

      if (individualPages) {
        // this is a hack which stores individual pages of the source pdf into individual index
        // documents
        // the new documents will get a virtual link with a post argument page=X appended to the
        // original url

        // collect the text of each page separately
        int pagecount = pdfDoc.getNumberOfPages();
        String[] pages = new String[pagecount];
        for (int page = 1; page <= pagecount; page++) {
          stripper.setStartPage(page);
          stripper.setEndPage(page);
          pages[page - 1] = stripper.getText(pdfDoc);
        }

        // create individual documents for each page
        assert pages.length == pdflinks.length
            : "pages.length = " + pages.length + ", pdflinks.length = " + pdflinks.length;
        result = new Document[Math.min(pages.length, pdflinks.length)];
        String loc = location.toNormalform(true);
        for (int page = 0; page < result.length; page++) {
          result[page] =
              new Document(
                  new AnchorURL(
                      loc
                          + (loc.indexOf('?') > 0 ? '&' : '?')
                          + individualPagePropertyname
                          + '='
                          + (page
                              + 1)), // these are virtual new pages; we cannot combine them with '#'
                                     // as that would be removed when computing the urlhash
                  mimeType,
                  StandardCharsets.UTF_8.name(),
                  this,
                  null,
                  docKeywords,
                  singleList(docTitle),
                  docAuthor,
                  docPublisher,
                  null,
                  null,
                  0.0f,
                  0.0f,
                  // guard uses >= (was '>', an off-by-one that would not have
                  // prevented an out-of-bounds access at page == pages.length)
                  pages == null || page >= pages.length ? new byte[0] : UTF8.getBytes(pages[page]),
                  pdflinks == null || page >= pdflinks.length ? null : pdflinks[page],
                  null,
                  null,
                  false,
                  docDate);
        }
      } else {
        // collect the whole text at once: the first 3 pages are read inline,
        // the rest in a watchdog thread because pdfbox sometimes never returns
        final CharBuffer writer = new CharBuffer(odtParser.MAX_DOCSIZE);
        byte[] contentBytes = new byte[0];
        stripper.setEndPage(3); // get first 3 pages (always)
        writer.append(stripper.getText(pdfDoc));
        contentBytes = writer.getBytes(); // remember text in case of interrupting thread

        if (pdfDoc.getNumberOfPages() > 3) { // spare creating/starting thread if all pages read
          stripper.setStartPage(4); // continue with page 4 (terminated, resulting in no text)
          stripper.setEndPage(Integer.MAX_VALUE); // set to default
          // we start the pdf parsing in a separate thread to ensure that it can be terminated
          final PDDocument pdfDocC = pdfDoc;
          final Thread t =
              new Thread() {
                @Override
                public void run() {
                  Thread.currentThread().setName("pdfParser.getText:" + location);
                  try {
                    writer.append(stripper.getText(pdfDocC));
                  } catch (final Throwable ignored) {
                    // best effort: keep whatever text was appended before the failure
                  }
                }
              };
          t.start();
          t.join(3000); // pdfbox likes to forget to terminate ... (quite often)
          if (t.isAlive()) t.interrupt();
        }
        contentBytes = writer.getBytes(); // get final text before closing writer

        // merge the per-page link collections into one set for the single document
        Collection<AnchorURL> pdflinksCombined = new HashSet<AnchorURL>();
        for (Collection<AnchorURL> pdflinksx : pdflinks)
          if (pdflinksx != null) pdflinksCombined.addAll(pdflinksx);
        result =
            new Document[] {
              new Document(
                  location,
                  mimeType,
                  StandardCharsets.UTF_8.name(),
                  this,
                  null,
                  docKeywords,
                  singleList(docTitle),
                  docAuthor,
                  docPublisher,
                  null,
                  null,
                  0.0f,
                  0.0f,
                  contentBytes,
                  pdflinksCombined,
                  null,
                  null,
                  false,
                  docDate)
            };
      }
    } catch (final Throwable e) {
      // NOTE(review): extraction failures are deliberately swallowed here; when
      // this happens before 'result' is assigned the method returns null —
      // callers must tolerate that (the original rethrow is kept commented out)
      // throw new Parser.Failure(e.getMessage(), location);
    } finally {
      closeDocQuietly(pdfDoc);
    }

    // clear resources in pdfbox. they say that is resolved but it's not. see:
    // https://issues.apache.org/jira/browse/PDFBOX-313
    // https://issues.apache.org/jira/browse/PDFBOX-351
    // https://issues.apache.org/jira/browse/PDFBOX-441
    // the pdfbox still generates enormeous number of object allocations and don't delete these
    // the following Object are statically stored and never flushed:
    // COSFloat, COSArray, COSInteger, COSObjectKey, COSObject, COSDictionary,
    // COSStream, COSString, COSName, COSDocument, COSInteger[], COSNull
    // the great number of these objects can easily be seen in Java Visual VM
    // we try to get this shit out of the memory here by forced clear calls, hope the best the
    // rubbish gets out.
    pdfDoc = null;
    clean_up_idiotic_PDFParser_font_cache_which_eats_up_tons_of_megabytes();

    return result;
  }

  /** Closes the given document, suppressing any exception (best-effort cleanup). */
  private static void closeDocQuietly(final PDDocument doc) {
    try {
      doc.close();
    } catch (final Throwable ignored) {
      // nothing sensible to do when closing fails
    }
  }
  /**
   * Starts the text extraction.
   *
   * <p>Parses the commandline arguments, loads the PDF, and writes the extracted text (plain or
   * HTML) either to the console or to an output file derived from the input file name. Text of any
   * embedded PDF attachments is appended as well.
   *
   * @param args the commandline arguments.
   * @throws IOException if there is an error reading the document or extracting the text.
   */
  public void startExtraction(String[] args) throws IOException {
    boolean toConsole = false;
    boolean toHTML = false;
    boolean sort = false;
    boolean separateBeads = true;
    String password = "";
    String encoding = "UTF-8";
    String pdfFile = null;
    String outputFile = null;
    // Defaults to text files
    String ext = ".txt";
    int startPage = 1;
    int endPage = Integer.MAX_VALUE;
    // Option arguments consume the following args[i]; usage() is invoked when a
    // value option appears last without its value.
    for (int i = 0; i < args.length; i++) {
      switch (args[i]) {
        case PASSWORD:
          i++;
          if (i >= args.length) {
            usage();
          }
          password = args[i];
          break;
        case ENCODING:
          i++;
          if (i >= args.length) {
            usage();
          }
          encoding = args[i];
          break;
        case START_PAGE:
          i++;
          if (i >= args.length) {
            usage();
          }
          startPage = Integer.parseInt(args[i]);
          break;
        case HTML:
          toHTML = true;
          ext = ".html";
          break;
        case SORT:
          sort = true;
          break;
        case IGNORE_BEADS:
          separateBeads = false;
          break;
        case DEBUG:
          debug = true;
          break;
        case END_PAGE:
          i++;
          if (i >= args.length) {
            usage();
          }
          endPage = Integer.parseInt(args[i]);
          break;
        case CONSOLE:
          toConsole = true;
          break;
        default:
          // first positional arg is the input pdf, second (optional) the output file
          if (pdfFile == null) {
            pdfFile = args[i];
          } else {
            outputFile = args[i];
          }
          break;
      }
    }

    if (pdfFile == null) {
      usage();
    } else {

      Writer output = null;
      PDDocument document = null;
      try {
        long startTime = startProcessing("Loading PDF " + pdfFile);
        // Derive the output name by swapping the 4-char extension (".pdf").
        // NOTE(review): when pdfFile is 4 chars or shorter and -console is not
        // given, outputFile stays null and opening the stream below will NPE.
        if (outputFile == null && pdfFile.length() > 4) {
          outputFile = new File(pdfFile.substring(0, pdfFile.length() - 4) + ext).getAbsolutePath();
        }
        document = PDDocument.load(new File(pdfFile), password);

        AccessPermission ap = document.getCurrentAccessPermission();
        if (!ap.canExtractContent()) {
          throw new IOException("You do not have permission to extract text");
        }

        stopProcessing("Time for loading: ", startTime);

        if (toConsole) {
          output = new OutputStreamWriter(System.out, encoding);
        } else {
          output = new OutputStreamWriter(new FileOutputStream(outputFile), encoding);
        }

        PDFTextStripper stripper;
        if (toHTML) {
          stripper = new PDFText2HTML();
        } else {
          stripper = new PDFTextStripper();
        }
        stripper.setSortByPosition(sort);
        stripper.setShouldSeparateByBeads(separateBeads);
        stripper.setStartPage(startPage);
        stripper.setEndPage(endPage);

        startTime = startProcessing("Starting text extraction");
        if (debug) {
          System.err.println("Writing to " + outputFile);
        }

        // Extract text for main document:
        stripper.writeText(document, output);

        // ... also for any embedded PDFs:
        extractEmbeddedPdfText(document, stripper, output);

        stopProcessing("Time for extraction: ", startTime);
      } finally {
        IOUtils.closeQuietly(output);
        IOUtils.closeQuietly(document);
      }
    }
  }

  /**
   * Extracts the text of every PDF attachment embedded in {@code document} and appends it to
   * {@code output} using the given {@code stripper}. Non-PDF attachments are skipped.
   *
   * @param document the already-loaded host document.
   * @param stripper the configured text stripper (page range, sorting, beads).
   * @param output the destination writer; not closed by this method.
   * @throws IOException if reading an attachment or writing its text fails.
   */
  private void extractEmbeddedPdfText(PDDocument document, PDFTextStripper stripper, Writer output)
      throws IOException {
    PDDocumentCatalog catalog = document.getDocumentCatalog();
    PDDocumentNameDictionary names = catalog.getNames();
    if (names == null) {
      return;
    }
    PDEmbeddedFilesNameTreeNode embeddedFiles = names.getEmbeddedFiles();
    if (embeddedFiles == null) {
      return;
    }
    Map<String, PDComplexFileSpecification> embeddedFileNames = embeddedFiles.getNames();
    if (embeddedFileNames == null) {
      return;
    }
    for (Map.Entry<String, PDComplexFileSpecification> ent : embeddedFileNames.entrySet()) {
      if (debug) {
        System.err.println("Processing embedded file " + ent.getKey() + ":");
      }
      PDComplexFileSpecification spec = ent.getValue();
      PDEmbeddedFile file = spec.getEmbeddedFile();
      if (file == null || !"application/pdf".equals(file.getSubtype())) {
        continue;
      }
      if (debug) {
        System.err.println("  is PDF (size=" + file.getSize() + ")");
      }
      InputStream fis = file.createInputStream();
      PDDocument subDoc = null;
      try {
        subDoc = PDDocument.load(fis);
      } finally {
        fis.close();
      }
      try {
        stripper.writeText(subDoc, output);
      } finally {
        IOUtils.closeQuietly(subDoc);
      }
    }
  }