/**
 * Saves a map with byte[] values to a file, converting each value to a UTF-8 string.
 *
 * @param file the target file
 * @param props the map to save
 * @param comment a comment written to the head of the file
 */
public static void saveMapB(final File file, final Map<String, byte[]> props, final String comment) {
    final HashMap<String, String> m = new HashMap<String, String>();
    for (final Map.Entry<String, byte[]> e : props.entrySet()) {
        m.put(e.getKey(), UTF8.String(e.getValue()));
    }
    saveMap(file, m, comment);
}
/**
 * Loads a map from a file, converting each String value back to a UTF-8 byte array.
 *
 * @param f the file to load from
 * @return the loaded map, or null if the file could not be read
 */
public static ConcurrentHashMap<String, byte[]> loadMapB(final File f) {
    final ConcurrentHashMap<String, String> m = loadMap(f);
    if (m == null) return null;
    final ConcurrentHashMap<String, byte[]> mb = new ConcurrentHashMap<String, byte[]>();
    for (final Map.Entry<String, String> e : m.entrySet()) {
        mb.put(e.getKey(), UTF8.getBytes(e.getValue()));
    }
    return mb;
}
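/*
 * A minimal usage sketch of the saveMapB/loadMapB round trip. The file name is
 * hypothetical, and the values must be valid UTF-8, because the round trip goes
 * through a String conversion in both directions.
 */
public static void exampleMapBRoundTrip() {
    final File f = new File("mapB-example.txt"); // hypothetical file
    final Map<String, byte[]> props = new HashMap<String, byte[]>();
    props.put("greeting", UTF8.getBytes("hello"));
    saveMapB(f, props, "example comment");
    final Map<String, byte[]> loaded = loadMapB(f);
    assert loaded != null && "hello".equals(UTF8.String(loaded.get("greeting")));
}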
/**
 * main - writes some data and checks the table's size (with time measuring)
 *
 * @param args
 */
public static void main(final String[] args) {
    // open a file, add one entry and exit
    final File f = new File(args[0]);
    if (f.exists()) FileUtils.deletedelete(f);
    try {
        final Records t = new Records(f, 8);
        final byte[] b = new byte[8];
        t.add("01234567".getBytes(), 0);
        t.add("ABCDEFGH".getBytes(), 0);
        t.add("abcdefgh".getBytes(), 0);
        t.add("--------".getBytes(), 0);
        t.add("********".getBytes(), 0);
        for (int i = 0; i < 1000; i++) t.add("++++++++".getBytes(), 0);
        t.add("=======0".getBytes(), 0);
        t.add("=======1".getBytes(), 0);
        t.add("=======2".getBytes(), 0);
        t.cleanLast(b, 0);
        System.out.println(UTF8.String(b));
        t.cleanLast(b, 0); // t.clean(2, b, 0);
        System.out.println(UTF8.String(b));
        t.get(1, b, 0);
        System.out.println(UTF8.String(b));
        t.put(1, "AbCdEfGh".getBytes(), 0);
        t.get(1, b, 0);
        System.out.println(UTF8.String(b));
        t.get(3, b, 0);
        System.out.println(UTF8.String(b));
        t.get(4, b, 0);
        System.out.println(UTF8.String(b));
        System.out.println("size = " + t.size());
        // t.clean(t.size() - 2);
        t.cleanLast();
        final long start = System.currentTimeMillis();
        long c = 0;
        for (int i = 0; i < 100000; i++) {
            c = t.size();
        }
        // 100000 calls: elapsed ms * 1e6 ns / 1e5 calls = elapsed * 10 nanoseconds per call
        System.out.println(
            "size() needs " + ((System.currentTimeMillis() - start) * 10) + " nanoseconds");
        System.out.println("size = " + c);
        t.close();
    } catch (final IOException e) {
        ConcurrentLog.logException(e);
    }
}
@Override
public Document[] parse(
        final DigestURL location,
        final String mimeType,
        final String charset,
        final VocabularyScraper scraper,
        final int timezoneOffset,
        final InputStream source)
        throws Parser.Failure, InterruptedException {
    byte[] b = null;
    try {
        b = FileUtils.read(source);
    } catch (final IOException e1) {
        throw new Parser.Failure(e1.toString(), location);
    }
    final BDecoder bd = new BDecoder(b);
    final BObject bo = bd.parse();
    if (bo == null) throw new Parser.Failure("BDecoder.parse returned null", location);
    if (bo.getType() != BType.dictionary)
        throw new Parser.Failure("BDecoder object is not a dictionary", location);
    final Map<String, BObject> map = bo.getMap();
    final BObject commento = map.get("comment");
    final String comment = (commento == null) ? "" : UTF8.String(commento.getString());
    // Date creation = new Date(map.get("creation date").getInteger());
    final BObject infoo = map.get("info");
    final StringBuilder filenames = new StringBuilder(80);
    String title = "";
    if (infoo != null) {
        final Map<String, BObject> info = infoo.getMap();
        final BObject fileso = info.get("files");
        if (fileso != null) {
            final List<BObject> filelist = fileso.getList();
            for (final BObject fo : filelist) {
                final BObject patho = fo.getMap().get("path");
                if (patho != null) {
                    final List<BObject> l = patho.getList(); // one file may have several names
                    for (final BObject fl : l) {
                        filenames.append(fl.toString()).append(" ");
                    }
                }
            }
        }
        final BObject nameo = info.get("name");
        if (nameo != null) title = UTF8.String(nameo.getString());
    }
    if (title == null || title.isEmpty()) title = MultiProtocolURL.unescape(location.getFileName());
    return new Document[] {
        new Document(
                location,
                mimeType,
                charset,
                this,
                null,
                null,
                singleList(title), // title
                comment, // author
                location.getHost(),
                null,
                null,
                0.0d,
                0.0d,
                filenames.toString(),
                null,
                null,
                null,
                false,
                new Date())
    };
}
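/*
 * For reference, the bencoded dictionary a .torrent file carries (per BEP 3),
 * which the parser above walks to collect the title and the file names:
 *
 *   {
 *     "announce": <tracker URL>,
 *     "comment": <optional free text>,            // used in the author slot above
 *     "creation date": <unix timestamp>,          // currently not evaluated
 *     "info": {
 *       "name": <suggested file/directory name>,  // used as the title
 *       "files": [                                // multi-file torrents only
 *         { "length": <bytes>, "path": [<dir>, ..., <file name>] },
 *         ...
 *       ]
 *     }
 *   }
 */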
@Override
public Document[] parse(
        final AnchorURL location,
        final String mimeType,
        final String charset,
        final VocabularyScraper scraper,
        final int timezoneOffset,
        final InputStream source)
        throws Parser.Failure, InterruptedException {

    // check memory for parser
    if (!MemoryControl.request(200 * 1024 * 1024, false))
        throw new Parser.Failure(
                "Not enough Memory available for pdf parser: " + MemoryControl.available(), location);

    // create a pdf parser
    PDDocument pdfDoc;
    try {
        Thread.currentThread().setPriority(Thread.MIN_PRIORITY); // the pdfparser is a big pain
        // pdfDoc = PDDocument.load(source);
        final PDFParser pdfParser = new PDFParser(source);
        pdfParser.setTempDirectory(new File(System.getProperty("java.io.tmpdir")));
        pdfParser.parse();
        pdfDoc = pdfParser.getPDDocument();
    } catch (final IOException e) {
        throw new Parser.Failure(e.getMessage(), location);
    } finally {
        Thread.currentThread().setPriority(Thread.NORM_PRIORITY);
    }

    if (pdfDoc.isEncrypted()) {
        try {
            pdfDoc.openProtection(new StandardDecryptionMaterial(""));
        } catch (final BadSecurityHandlerException e) {
            try { pdfDoc.close(); } catch (final IOException ee) { }
            throw new Parser.Failure("Document is encrypted (1): " + e.getMessage(), location);
        } catch (final IOException e) {
            try { pdfDoc.close(); } catch (final IOException ee) { }
            throw new Parser.Failure("Document is encrypted (2): " + e.getMessage(), location);
        } catch (final CryptographyException e) {
            try { pdfDoc.close(); } catch (final IOException ee) { }
            throw new Parser.Failure("Document is encrypted (3): " + e.getMessage(), location);
        }
        final AccessPermission perm = pdfDoc.getCurrentAccessPermission();
        if (perm == null || !perm.canExtractContent()) {
            try { pdfDoc.close(); } catch (final IOException ee) { }
            throw new Parser.Failure("Document is encrypted and cannot be decrypted", location);
        }
    }

    // extracting some metadata
    PDDocumentInformation info = pdfDoc.getDocumentInformation();
    String docTitle = null, docSubject = null, docAuthor = null, docPublisher = null, docKeywordStr = null;
    Date docDate = new Date();
    if (info != null) {
        docTitle = info.getTitle();
        docSubject = info.getSubject();
        docAuthor = info.getAuthor();
        docPublisher = info.getProducer();
        if (docPublisher == null || docPublisher.isEmpty()) docPublisher = info.getCreator();
        docKeywordStr = info.getKeywords();
        try {
            if (info.getModificationDate() != null) docDate = info.getModificationDate().getTime();
        } catch (final IOException e) { }
        // unused: info.getTrapped()
    }
    info = null;

    if (docTitle == null || docTitle.isEmpty()) {
        docTitle = MultiProtocolURL.unescape(location.getFileName());
    }
    if (docTitle == null) {
        docTitle = docSubject;
    }
    String[] docKeywords = null;
    if (docKeywordStr != null) {
        docKeywords = docKeywordStr.split(" |,");
    }

    Collection<AnchorURL>[] pdflinks = null;
    Document[] result = null;
    try {
        // get the links
        pdflinks = extractPdfLinks(pdfDoc);

        // get the fulltext (either per document or for each page)
        final PDFTextStripper stripper = new PDFTextStripper(StandardCharsets.UTF_8.name());

        if (individualPages) {
            // this is a hack which stores individual pages of the source pdf into individual
            // index documents. the new documents will get a virtual link with a post argument
            // page=X appended to the original url

            // collect text
            final int pagecount = pdfDoc.getNumberOfPages();
            final String[] pages = new String[pagecount];
            for (int page = 1; page <= pagecount; page++) {
                stripper.setStartPage(page);
                stripper.setEndPage(page);
                pages[page - 1] = stripper.getText(pdfDoc);
                // System.out.println("PAGE " + page + ": " + pages[page - 1]);
            }

            // create individual documents for each page
            assert pages.length == pdflinks.length
                    : "pages.length = " + pages.length + ", pdflinks.length = " + pdflinks.length;
            result = new Document[Math.min(pages.length, pdflinks.length)];
            final String loc = location.toNormalform(true);
            for (int page = 0; page < result.length; page++) {
                result[page] =
                        new Document(
                                new AnchorURL(
                                        loc
                                                + (loc.indexOf('?') > 0 ? '&' : '?')
                                                + individualPagePropertyname
                                                + '='
                                                + (page + 1)), // these are virtual new pages; we cannot combine them
                                                               // with '#' as that would be removed when computing the urlhash
                                mimeType,
                                StandardCharsets.UTF_8.name(),
                                this,
                                null,
                                docKeywords,
                                singleList(docTitle),
                                docAuthor,
                                docPublisher,
                                null,
                                null,
                                0.0f,
                                0.0f,
                                pages == null || page >= pages.length ? new byte[0] : UTF8.getBytes(pages[page]),
                                pdflinks == null || page >= pdflinks.length ? null : pdflinks[page],
                                null,
                                null,
                                false,
                                docDate);
            }
        } else {
            // collect the whole text at once
            final CharBuffer writer = new CharBuffer(odtParser.MAX_DOCSIZE);
            byte[] contentBytes = new byte[0];
            stripper.setEndPage(3); // get first 3 pages (always)
            writer.append(stripper.getText(pdfDoc));
            contentBytes = writer.getBytes(); // remember text in case of interrupting thread

            if (pdfDoc.getNumberOfPages() > 3) { // spare creating/starting a thread if all pages are read
                stripper.setStartPage(4); // continue with page 4 (terminated, resulting in no text)
                stripper.setEndPage(Integer.MAX_VALUE); // set to default

                // we start the pdf parsing in a separate thread to ensure that it can be terminated
                final PDDocument pdfDocC = pdfDoc;
                final Thread t = new Thread() {
                    @Override
                    public void run() {
                        Thread.currentThread().setName("pdfParser.getText:" + location);
                        try {
                            writer.append(stripper.getText(pdfDocC));
                        } catch (final Throwable e) { }
                    }
                };
                t.start();
                t.join(3000);
                if (t.isAlive()) t.interrupt(); // pdfbox likes to forget to terminate ... (quite often)
            }
            contentBytes = writer.getBytes(); // get final text before closing writer

            final Collection<AnchorURL> pdflinksCombined = new HashSet<AnchorURL>();
            for (final Collection<AnchorURL> pdflinksx : pdflinks)
                if (pdflinksx != null) pdflinksCombined.addAll(pdflinksx);
            result = new Document[] {
                new Document(
                        location,
                        mimeType,
                        StandardCharsets.UTF_8.name(),
                        this,
                        null,
                        docKeywords,
                        singleList(docTitle),
                        docAuthor,
                        docPublisher,
                        null,
                        null,
                        0.0f,
                        0.0f,
                        contentBytes,
                        pdflinksCombined,
                        null,
                        null,
                        false,
                        docDate)
            };
        }
    } catch (final Throwable e) {
        // any extraction error is swallowed; pdfDoc is closed in the finally block
        // throw new Parser.Failure(e.getMessage(), location);
    } finally {
        try {
            pdfDoc.close();
        } catch (final Throwable e) { }
    }

    // clear resources in pdfbox. they say that is resolved but it's not. see:
    // https://issues.apache.org/jira/browse/PDFBOX-313
    // https://issues.apache.org/jira/browse/PDFBOX-351
    // https://issues.apache.org/jira/browse/PDFBOX-441
    // pdfbox still generates an enormous number of object allocations and does not delete them.
    // the following objects are statically stored and never flushed:
    // COSFloat, COSArray, COSInteger, COSObjectKey, COSObject, COSDictionary,
    // COSStream, COSString, COSName, COSDocument, COSInteger[], COSNull
    // the great number of these objects can easily be seen in Java VisualVM.
    // we try to get this rubbish out of memory here with forced clear calls and hope for the best.
    pdfDoc = null;
    clean_up_idiotic_PDFParser_font_cache_which_eats_up_tons_of_megabytes();

    return result;
}
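/*
 * A minimal, self-contained sketch of the timeout pattern used above for the
 * full-text extraction: run the blocking work on a worker thread, wait a bounded
 * time, and interrupt the worker if it has not finished. The method name and
 * timeout here are hypothetical stand-ins, not part of the parser.
 */
private static String extractWithTimeout(
        final java.util.concurrent.Callable<String> task, final long timeoutMillis) {
    final java.util.concurrent.FutureTask<String> future =
            new java.util.concurrent.FutureTask<String>(task);
    final Thread worker = new Thread(future, "extractWithTimeout");
    worker.start();
    try {
        return future.get(timeoutMillis, java.util.concurrent.TimeUnit.MILLISECONDS);
    } catch (final java.util.concurrent.TimeoutException e) {
        worker.interrupt(); // the task hangs; give up and fall back to what we already have
        return "";
    } catch (final Exception e) {
        return ""; // ExecutionException or InterruptedException: no text extracted
    }
}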
/**
 * Deletes files and directories. If a directory is not empty, everything inside it is deleted
 * first. Because deletion sometimes fails on Windows, a Windows shell command is used as a
 * fallback.
 *
 * @param path the file or directory to delete
 */
public static void deletedelete(final File path) {
    if (path == null || !path.exists()) {
        return;
    }

    // empty the directory first
    if (path.isDirectory()) {
        final String[] list = path.list();
        if (list != null) {
            for (final String s : list) {
                deletedelete(new File(path, s));
            }
        }
    }

    if (path.exists()) path.delete();
    /*
    int c = 0;
    while ( c++ < 20 ) {
        if ( !path.exists() ) {
            break;
        }
        if ( path.delete() ) {
            break;
        }
        // some OS may be slow when giving up file pointer
        //System.runFinalization();
        //System.gc();
        try {
            Thread.sleep(200);
        } catch (final InterruptedException e ) {
            break;
        }
    }
    */
    if (path.exists()) {
        path.deleteOnExit();
        String p = "";
        try {
            p = path.getCanonicalPath();
        } catch (final IOException e1) {
            ConcurrentLog.logException(e1);
        }
        if (System.getProperties().getProperty("os.name", "").toLowerCase().startsWith("windows")) {
            // deleting files on windows sometimes does not work with java
            try {
                final String command = "cmd /C del /F /Q \"" + p + "\"";
                final Process r = Runtime.getRuntime().exec(command);
                if (r == null) {
                    ConcurrentLog.severe("FileUtils", "cannot execute command: " + command);
                } else {
                    final byte[] response = read(r.getInputStream());
                    ConcurrentLog.info("FileUtils", "deletedelete: " + UTF8.String(response));
                }
            } catch (final IOException e) {
                ConcurrentLog.logException(e);
            }
        }
        if (path.exists()) {
            ConcurrentLog.severe("FileUtils", "cannot delete file " + p);
        }
    }
}
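/*
 * On Java 7+ the same recursive delete can be expressed with java.nio.file, which
 * reports the cause of a failed deletion as an exception instead of returning false.
 * A minimal sketch (not used by deletedelete above):
 */
public static void deleteRecursively(final java.nio.file.Path root) throws IOException {
    java.nio.file.Files.walkFileTree(root, new java.nio.file.SimpleFileVisitor<java.nio.file.Path>() {
        @Override
        public java.nio.file.FileVisitResult visitFile(
                final java.nio.file.Path file, final java.nio.file.attribute.BasicFileAttributes attrs)
                throws IOException {
            java.nio.file.Files.delete(file); // throws IOException with the reason on failure
            return java.nio.file.FileVisitResult.CONTINUE;
        }

        @Override
        public java.nio.file.FileVisitResult postVisitDirectory(
                final java.nio.file.Path dir, final IOException exc) throws IOException {
            if (exc != null) throw exc;
            java.nio.file.Files.delete(dir); // the directory is empty by now
            return java.nio.file.FileVisitResult.CONTINUE;
        }
    });
}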
static final void sendRespondHeader(
        final HashMap<String, Object> conProp,
        final OutputStream respond,
        String httpVersion,
        final int httpStatusCode,
        String httpStatusText,
        ResponseHeader responseHeader)
        throws IOException {

    if (respond == null) throw new NullPointerException("The outputstream must not be null.");
    if (conProp == null)
        throw new NullPointerException("The connection property structure must not be null.");
    if (httpVersion == null)
        httpVersion = (String) conProp.get(HeaderFramework.CONNECTION_PROP_HTTP_VER);
    if (httpVersion == null) httpVersion = HeaderFramework.HTTP_VERSION_1_1;
    if (responseHeader == null) responseHeader = new ResponseHeader(httpStatusCode);

    try {
        if ((httpStatusText == null) || (httpStatusText.length() == 0)) {
            // http1_1 includes http1_0 messages
            if (HeaderFramework.http1_1.containsKey(Integer.toString(httpStatusCode)))
                httpStatusText = HeaderFramework.http1_1.get(Integer.toString(httpStatusCode));
            else httpStatusText = "Unknown";
        }

        final StringBuilder header = new StringBuilder(560);

        // "HTTP/0.9" does not have a status line or header in the response
        if (!httpVersion.toUpperCase().equals(HeaderFramework.HTTP_VERSION_0_9)) {
            // write status line
            header
                    .append(httpVersion)
                    .append(" ")
                    .append(Integer.toString(httpStatusCode))
                    .append(" ")
                    .append(httpStatusText)
                    .append("\r\n");

            // prepare header
            if (!responseHeader.containsKey(HeaderFramework.DATE))
                responseHeader.put(HeaderFramework.DATE, HeaderFramework.formatRFC1123(new Date()));
            if (!responseHeader.containsKey(HeaderFramework.CONTENT_TYPE))
                responseHeader.put(HeaderFramework.CONTENT_TYPE, "text/html; charset=UTF-8"); // fix this
            if (!responseHeader.containsKey(RequestHeader.CONNECTION)
                    && conProp.containsKey(HeaderFramework.CONNECTION_PROP_PERSISTENT))
                responseHeader.put(
                        RequestHeader.CONNECTION,
                        (String) conProp.get(HeaderFramework.CONNECTION_PROP_PERSISTENT));
            if (!responseHeader.containsKey(RequestHeader.PROXY_CONNECTION)
                    && conProp.containsKey(HeaderFramework.CONNECTION_PROP_PERSISTENT))
                responseHeader.put(
                        RequestHeader.PROXY_CONNECTION,
                        (String) conProp.get(HeaderFramework.CONNECTION_PROP_PERSISTENT));
            if (conProp.containsKey(HeaderFramework.CONNECTION_PROP_PERSISTENT)
                    && conProp.get(HeaderFramework.CONNECTION_PROP_PERSISTENT).equals("keep-alive")
                    && !responseHeader.containsKey(HeaderFramework.TRANSFER_ENCODING)
                    && !responseHeader.containsKey(HeaderFramework.CONTENT_LENGTH))
                responseHeader.put(HeaderFramework.CONTENT_LENGTH, "0");

            // read custom headers
            final Iterator<ResponseHeader.Entry> it =
                    responseHeader.getAdditionalHeaderProperties().iterator();
            ResponseHeader.Entry e;
            while (it.hasNext()) {
                // append user properties to the main String
                // TODO: should we check the user properties? what if they intersect properties
                // that are already in the header?
                e = it.next();
                header.append(e.getKey()).append(": ").append(e.getValue()).append("\r\n");
            }

            // write header
            final Iterator<String> i = responseHeader.keySet().iterator();
            String key;
            char tag;
            int count;
            while (i.hasNext()) {
                key = i.next();
                tag = key.charAt(0);
                if ((tag != '*') && (tag != '#')) {
                    // '#' in key is reserved for proxy attributes as artificial header values
                    count = responseHeader.keyCount(key);
                    for (int j = 0; j < count; j++) {
                        header
                                .append(key)
                                .append(": ")
                                .append(responseHeader.getSingle(key, j))
                                .append("\r\n");
                    }
                }
            }

            // end header
            header.append("\r\n");

            // send the headers to the client and flush the stream
            respond.write(UTF8.getBytes(header.toString()));
            respond.flush();
        }

        conProp.put(HeaderFramework.CONNECTION_PROP_PROXY_RESPOND_HEADER, responseHeader);
        conProp.put(
                HeaderFramework.CONNECTION_PROP_PROXY_RESPOND_STATUS, Integer.toString(httpStatusCode));
    } catch (final Exception e) {
        // any interruption may be caused by a network error or because the user has closed
        // the window during transmission. We simply pass it on as an IOException.
        throw new IOException(e.getMessage());
    }
}
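/*
 * For illustration, the bytes sendRespondHeader writes for a plain 200 response
 * look roughly like this (the exact header names, values, and order depend on
 * the ResponseHeader and connection properties at runtime):
 *
 *   HTTP/1.1 200 OK\r\n
 *   Date: Tue, 23 Aug 2005 11:19:14 GMT\r\n
 *   Content-Type: text/html; charset=UTF-8\r\n
 *   Content-Length: 0\r\n
 *   \r\n
 */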
private static final void sendRespondError(
        final HashMap<String, Object> conProp,
        final OutputStream respond,
        final int errorcase,
        final int httpStatusCode,
        String httpStatusText,
        final String detailedErrorMsgText,
        final Object detailedErrorMsgFile,
        final serverObjects detailedErrorMsgValues,
        final Throwable stackTrace,
        ResponseHeader header)
        throws IOException {

    FileInputStream fis = null;
    ByteArrayOutputStream o = null;
    try {
        // setting the proper http status message
        String httpVersion = (String) conProp.get(HeaderFramework.CONNECTION_PROP_HTTP_VER);
        if (httpVersion == null) httpVersion = "HTTP/1.1";
        if ((httpStatusText == null) || (httpStatusText.length() == 0)) {
            // http1_1 includes http1_0 messages
            if (HeaderFramework.http1_1.containsKey(Integer.toString(httpStatusCode)))
                httpStatusText = HeaderFramework.http1_1.get(Integer.toString(httpStatusCode));
            else httpStatusText = "Unknown";
        }

        // generating the desired request url
        String host = (String) conProp.get(HeaderFramework.CONNECTION_PROP_HOST);
        String path = (String) conProp.get(HeaderFramework.CONNECTION_PROP_PATH);
        if (path == null) path = "/";
        final String args = (String) conProp.get(HeaderFramework.CONNECTION_PROP_ARGS);
        final String method = (String) conProp.get(HeaderFramework.CONNECTION_PROP_METHOD);

        final int port = Domains.stripToPort(host);
        host = Domains.stripToHostName(host);

        String urlString;
        try {
            urlString =
                    (new DigestURL(
                                    (method.equals(HeaderFramework.METHOD_CONNECT) ? "https" : "http"),
                                    host,
                                    port,
                                    (args == null) ? path : path + "?" + args))
                            .toString();
        } catch (final MalformedURLException e) {
            urlString = "invalid URL";
        }

        // set rewrite values
        final serverObjects tp = new serverObjects();
        String clientIP = (String) conProp.get(HeaderFramework.CONNECTION_PROP_CLIENTIP);
        if (clientIP == null) clientIP = Domains.LOCALHOST;
        tp.put("peerName", (switchboard.peers == null) ? "" : switchboard.peers.myName());
        tp.put("errorMessageType", Integer.toString(errorcase));
        tp.put("httpStatus", Integer.toString(httpStatusCode) + " " + httpStatusText);
        tp.put("requestMethod", (String) conProp.get(HeaderFramework.CONNECTION_PROP_METHOD));
        tp.put("requestURL", urlString);

        switch (errorcase) {
            case ERRORCASE_FILE:
                tp.put(
                        "errorMessageType_file",
                        (detailedErrorMsgFile == null) ? "" : detailedErrorMsgFile.toString());
                if ((detailedErrorMsgValues != null) && !detailedErrorMsgValues.isEmpty()) {
                    // rewrite the value names and add the proper name prefix:
                    for (final Entry<String, String> entry : detailedErrorMsgValues.entrySet()) {
                        tp.put("errorMessageType_" + entry.getKey(), entry.getValue());
                    }
                }
                break;
            case ERRORCASE_MESSAGE:
            default:
                tp.put(
                        "errorMessageType_detailedErrorMsg",
                        (detailedErrorMsgText == null)
                                ? ""
                                : detailedErrorMsgText.replaceAll("\n", "<br />"));
                break;
        }

        // building the stacktrace
        if (stackTrace != null) {
            tp.put("printStackTrace", "1");
            final ByteBuffer errorMsg = new ByteBuffer(100);
            final PrintStream printStream = new PrintStream(errorMsg);
            stackTrace.printStackTrace(printStream);
            tp.put("printStackTrace_exception", stackTrace.toString());
            tp.put("printStackTrace_stacktrace", UTF8.String(errorMsg.getBytes()));
            printStream.close();
        } else {
            tp.put("printStackTrace", "0");
        }

        // Generated Tue, 23 Aug 2005 11:19:14 GMT by brain.wg (squid/2.5.STABLE3)

        // adding some system information
        final String systemDate = HeaderFramework.formatRFC1123(new Date());
        tp.put("date", systemDate);

        // rewrite the file
        final File htRootPath =
                new File(
                        switchboard.getAppPath(),
                        switchboard.getConfig(
                                SwitchboardConstants.HTROOT_PATH, SwitchboardConstants.HTROOT_PATH_DEFAULT));

        TemplateEngine.writeTemplate(
                "/proxymsg/error.html",
                fis = new FileInputStream(new File(htRootPath, "/proxymsg/error.html")),
                o = new ByteArrayOutputStream(512),
                tp);
        final byte[] result = o.toByteArray();
        o.close();
        o = null;

        if (header == null) header = new ResponseHeader(httpStatusCode);
        header.put(
                HeaderFramework.CONNECTION_PROP_PROXY_RESPOND_STATUS, Integer.toString(httpStatusCode));
        header.put(HeaderFramework.DATE, systemDate);
        header.put(HeaderFramework.CONTENT_TYPE, "text/html");
        header.put(HeaderFramework.CONTENT_LENGTH, Integer.toString(result.length));
        header.put(HeaderFramework.PRAGMA, "no-cache, no-store");
        sendRespondHeader(conProp, respond, httpVersion, httpStatusCode, httpStatusText, header);

        if (!method.equals(HeaderFramework.METHOD_HEAD)) {
            // write the array to the client
            FileUtils.copy(result, respond);
        }
        respond.flush();
    } finally {
        if (fis != null)
            try {
                fis.close();
            } catch (final Exception e) {
                ConcurrentLog.logException(e);
            }
        if (o != null)
            try {
                o.close();
            } catch (final Exception e) {
                ConcurrentLog.logException(e);
            }
    }
}
@Override
public Request pop(boolean delay, CrawlSwitchboard cs, RobotsTxt robots) throws IOException {
    // returns a crawl entry from the stack and ensures minimum delta times

    long sleeptime = 0;
    Request crawlEntry = null;
    CrawlProfile profileEntry = null;
    synchronized (this) {
        mainloop:
        while (true) {
            final Index depthStack = getLowestStack();
            if (depthStack == null) return null;
            Row.Entry rowEntry = null;
            while (depthStack.size() > 0) {
                rowEntry = depthStack.removeOne();
                if (rowEntry != null) break;
            }
            if (rowEntry == null) continue mainloop;
            crawlEntry = new Request(rowEntry);

            // check blacklist (again) because the user may have created blacklist entries
            // after the queue has been filled
            if (Switchboard.urlBlacklist.isListed(BlacklistType.CRAWLER, crawlEntry.url())) {
                if (log.isFine()) log.fine("URL '" + crawlEntry.url() + "' is in blacklist.");
                continue mainloop;
            }

            // at this point we must check if the crawlEntry is still relevant, i.e. if the
            // crawl profile still exists. if not: return null. a calling method must handle
            // the null value and try again
            profileEntry = cs.get(UTF8.getBytes(crawlEntry.profileHandle()));
            if (profileEntry == null) {
                if (log.isFine()) log.fine("no profile entry for handle " + crawlEntry.profileHandle());
                continue mainloop;
            }

            // depending on the caching policy we need sleep time to avoid DoS-like situations
            sleeptime = Latency.getDomainSleepTime(robots, profileEntry, crawlEntry.url());
            break;
        }
    }
    if (crawlEntry == null) return null;

    ClientIdentification.Agent agent =
            profileEntry == null
                    ? ClientIdentification.yacyInternetCrawlerAgent
                    : profileEntry.getAgent();
    long robotsTime = Latency.getRobotsTime(robots, crawlEntry.url(), agent);
    Latency.updateAfterSelection(crawlEntry.url(), profileEntry == null ? 0 : robotsTime);
    if (delay && sleeptime > 0) {
        // force a busy waiting here.
        // in the best case, this should never happen if the balancer works properly.
        // this is only a protection against the worst case, where the crawler could
        // behave in a DoS-like manner
        if (log.isInfo())
            log.info(
                    "forcing crawl-delay of "
                            + sleeptime
                            + " milliseconds for "
                            + crawlEntry.url().getHost()
                            + ": "
                            + Latency.waitingRemainingExplain(crawlEntry.url(), robots, agent));
        long loops = sleeptime / 1000;
        long rest = sleeptime % 1000;
        if (loops < 3) {
            rest = rest + 1000 * loops;
            loops = 0;
        }
        Thread.currentThread()
                .setName(
                        "Balancer waiting for "
                                + crawlEntry.url().getHost()
                                + ": "
                                + sleeptime
                                + " milliseconds");
        synchronized (this) {
            // must be synchronized here to avoid 'takeover' moves from other threads which
            // would then idle the same time, which would not be enough
            if (rest > 0) {
                try {
                    this.wait(rest);
                } catch (final InterruptedException e) { }
            }
            for (int i = 0; i < loops; i++) {
                if (log.isInfo())
                    log.info(
                            "waiting for "
                                    + crawlEntry.url().getHost()
                                    + ": "
                                    + (loops - i)
                                    + " seconds remaining...");
                try {
                    this.wait(1000);
                } catch (final InterruptedException e) { }
            }
        }
        Latency.updateAfterSelection(crawlEntry.url(), robotsTime);
    }
    return crawlEntry;
}
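/*
 * The crawl-delay wait above splits the sleep time into one-second wait() slices so
 * that progress can be logged and the delay stays responsive to notify(). Worked
 * examples of the split (values illustrative):
 *
 *   sleeptime = 4700 ms  ->  loops = 4, rest = 700   ->  wait(700), then 4 x wait(1000)
 *   sleeptime = 2300 ms  ->  loops = 2, rest = 300   ->  loops < 3, so rest becomes 2300
 *                                                        and loops 0: a single wait(2300)
 */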