Exemple #1
0
 /**
  * Creates a {@code HeaderLineReader} preconfigured for parsing header lines.
  *
  * <p>The returned reader has name/value parsing switched on, expects ISO-8859-1 encoded input,
  * and has LWS, quoted text and encoded words enabled. Besides key:value headers it also reads
  * and returns normal lines, as described for {@code getLineReader()}.
  *
  * @return a new reader set up to read header lines
  */
 public static HeaderLineReader getHeaderLineReader() {
   final HeaderLineReader reader = new HeaderLineReader();
   // Header syntax features.
   reader.bEncodedWords = true;
   reader.bQuotedText = true;
   reader.bLWS = true;
   // Character encoding and key:value parsing.
   reader.encoding = ENC_ISO8859_1;
   reader.bNameValue = true;
   // reader.eol is not assigned here.
   return reader;
 }
 /**
  * Checks {@code HeaderLineReader.trim} against a table of inputs: empty and space-only buffers,
  * text with one to three leading and/or trailing spaces, and text wrapped in the control
  * characters U+0001 and U+0002. The expectations assert that surrounding spaces are removed
  * while the control characters and any spaces between them and the text are preserved.
  */
 @Test
 public void test_headerlinereader_trim() {
   // Each row is {buffer content, expected result of trim}.
   final String[][] trimCases = {
     {"", ""},
     {" ", ""},
     {"  ", ""},
     {"   ", ""},
     {"text", "text"},
     {" text", "text"},
     {"  text", "text"},
     {"   text", "text"},
     {"text ", "text"},
     {"text  ", "text"},
     {"text   ", "text"},
     {" text ", "text"},
     {"  text  ", "text"},
     {"   text   ", "text"},
     {" \u0001text\u0002 ", "\u0001text\u0002"},
     {"  \u0001text\u0002  ", "\u0001text\u0002"},
     {"   \u0001text\u0002   ", "\u0001text\u0002"},
     {"  \u0001 text \u0002  ", "\u0001 text \u0002"},
     {" \u0001  text  \u0002 ", "\u0001  text  \u0002"},
     {"\u0001   text   \u0002", "\u0001   text   \u0002"},
   };
   final StringBuffer buffer = new StringBuffer();
   for (String[] trimCase : trimCases) {
     // One buffer is reused for every case; it is emptied before the next input is loaded.
     buffer.setLength(0);
     buffer.append(trimCase[0]);
     Assert.assertEquals(trimCase[1], HeaderLineReader.trim(buffer));
   }
 }
 /**
  * Runs line-reading cases against a reader obtained from {@code HeaderLineReader.getLineReader()}
  * under four encodings in turn: RAW, US-ASCII, ISO8859-1 and UTF-8.
  *
  * <p>For each encoding a fresh reader is stored in {@code hlr}, its {@code encoding} is set, and
  * {@code test_line_cases} is invoked first with {@code commonCases} and then with the
  * encoding-specific {@code cases}. {@code hlr}, {@code commonCases}, {@code cases} and
  * {@code test_line_cases} are declared outside this method.
  *
  * <p>Each case row has four elements: input bytes, a {@code HeaderLine} type constant, a string,
  * and an int built from {@code HeaderLineReader.E_BIT_*} flags. Presumably the helper treats the
  * last three as the expected line type, expected line text and expected error bit mask - confirm
  * against {@code test_line_cases}.
  */
 @Test
 public void test_linereader_lines() {
   byte[] utf8Str;
   byte[] partialUtf8Str = null;
   try {
     // Build a UTF-8 byte sequence whose last byte is missing, so the final multi-byte
     // character (U+1234) is truncated.
     utf8Str = "WARCæøå\u1234".getBytes("UTF-8");
     partialUtf8Str = new byte[utf8Str.length - 1];
     System.arraycopy(utf8Str, 0, partialUtf8Str, 0, utf8Str.length - 1);
   } catch (UnsupportedEncodingException e) {
     e.printStackTrace();
     Assert.fail("Unexpected exception!");
   }
   try {
     // Cases run unchanged under every encoding: no line terminator, LF only, a CR in the
     // middle of the line, and a proper CRLF.
     commonCases =
         new Object[][] {
           {"WARC/1.0".getBytes(), HeaderLine.HLT_LINE, "WARC/1.0", HeaderLineReader.E_BIT_EOF},
           {
             "WARC/1.0\n".getBytes(),
             HeaderLine.HLT_LINE,
             "WARC/1.0",
             HeaderLineReader.E_BIT_MISSING_CR
           },
           {
             "WARC\r/1.0\n".getBytes(),
             HeaderLine.HLT_LINE,
             "WARC/1.0",
             HeaderLineReader.E_BIT_MISPLACED_CR | HeaderLineReader.E_BIT_MISSING_CR
           },
           {"WARC/1.0\r\n".getBytes(), HeaderLine.HLT_LINE, "WARC/1.0", 0},
         };
     /*
      * Raw.
      */
     // NOTE(review): the strings paired with the UTF-8 encoded input and with partialUtf8Str in
     // this section (and in the ISO8859-1 section below) contain characters above U+00FF. They
     // look like they went through an encoding conversion at some point - verify them against
     // the upstream test source before relying on them.
     cases =
         new Object[][] {
           {"WARC/1.0\u0001\r\n".getBytes(), HeaderLine.HLT_LINE, "WARC/1.0\u0001", 0},
           {"WARCæøå\u1234/1.0\r\n".getBytes("ISO8859-1"), HeaderLine.HLT_LINE, "WARCæøå?/1.0", 0},
           {
             "WARCæøå\u1234/1.0\r\n".getBytes("UTF-8"), HeaderLine.HLT_LINE, "WARCæøåሴ/1.0", 0
           },
           {partialUtf8Str, HeaderLine.HLT_LINE, "WARCæøåáˆ", HeaderLineReader.E_BIT_EOF}
         };
     hlr = HeaderLineReader.getLineReader();
     hlr.encoding = HeaderLineReader.ENC_RAW;
     test_line_cases(commonCases);
     test_line_cases(cases);
     /*
      * US-ASCII.
      */
     cases =
         new Object[][] {
           {
             "WARC/1.0\u0001\r\n".getBytes(),
             HeaderLine.HLT_LINE,
             "WARC/1.0",
             HeaderLineReader.E_BIT_INVALID_CONTROL_CHAR
           },
           {
             "WARCæøå\u1234/1.0\r\n".getBytes("ISO8859-1"),
             HeaderLine.HLT_LINE,
             "WARC?/1.0",
             HeaderLineReader.E_BIT_INVALID_US_ASCII_CHAR
           },
           {
             "WARCæøå\u1234/1.0\r\n".getBytes("UTF-8"),
             HeaderLine.HLT_LINE,
             "WARC/1.0",
             HeaderLineReader.E_BIT_INVALID_US_ASCII_CHAR
           },
           {
             partialUtf8Str,
             HeaderLine.HLT_LINE,
             "WARC",
             HeaderLineReader.E_BIT_INVALID_US_ASCII_CHAR | HeaderLineReader.E_BIT_EOF
           }
         };
     hlr = HeaderLineReader.getLineReader();
     hlr.encoding = HeaderLineReader.ENC_US_ASCII;
     test_line_cases(commonCases);
     test_line_cases(cases);
     /*
      * ISO8859-1.
      */
     cases =
         new Object[][] {
           {
             "WARC/1.0\u0001\r\n".getBytes(),
             HeaderLine.HLT_LINE,
             "WARC/1.0",
             HeaderLineReader.E_BIT_INVALID_CONTROL_CHAR
           },
           {"WARCæøå\u1234/1.0\r\n".getBytes("ISO8859-1"), HeaderLine.HLT_LINE, "WARCæøå?/1.0", 0},
           {
             "WARCæøå\u1234/1.0\r\n".getBytes("UTF-8"), HeaderLine.HLT_LINE, "WARCæøåሴ/1.0", 0
           },
           {partialUtf8Str, HeaderLine.HLT_LINE, "WARCæøåáˆ", HeaderLineReader.E_BIT_EOF}
         };
     hlr = HeaderLineReader.getLineReader();
     hlr.encoding = HeaderLineReader.ENC_ISO8859_1;
     test_line_cases(commonCases);
     test_line_cases(cases);
     /*
      * UTF-8.
      */
     cases =
         new Object[][] {
           {
             "WARC/1.0\u0001\r\n".getBytes(),
             HeaderLine.HLT_LINE,
             "WARC/1.0",
             HeaderLineReader.E_BIT_INVALID_CONTROL_CHAR
           },
           // NOTE(review): the string here has no '/' although the input does; presumably the
           // invalid byte sequence consumes the following byte - confirm against the reader.
           {
             "WARCæøå/1.0\r\n".getBytes("ISO8859-1"),
             HeaderLine.HLT_LINE,
             "WARC1.0",
             HeaderLineReader.E_BIT_INVALID_UTF8_ENCODING
           },
           {
             "WARCæøå\u1234/1.0\r\n".getBytes("ISO8859-1"),
             HeaderLine.HLT_LINE,
             "WARC/1.0",
             HeaderLineReader.E_BIT_INVALID_UTF8_ENCODING
           },
           {
             "WARCæøå\u1234/1.0\r\n".getBytes("UTF-8"), HeaderLine.HLT_LINE, "WARCæøå\u1234/1.0", 0
           },
           {partialUtf8Str, HeaderLine.HLT_LINE, "WARCæøå", HeaderLineReader.E_BIT_EOF}
         };
     hlr = HeaderLineReader.getLineReader();
     hlr.encoding = HeaderLineReader.ENC_UTF8;
     test_line_cases(commonCases);
     test_line_cases(cases);
   } catch (IOException e) {
     // Any I/O failure while building or running the cases fails the test.
     e.printStackTrace();
     Assert.fail("Unexpected exception!");
   }
 }
 /**
  * Feeds header-looking input ("content-type: monkeys", optionally followed by continuation
  * lines) to a reader obtained from {@code HeaderLineReader.getLineReader()} under four encodings
  * in turn: RAW, US-ASCII, ISO8859-1 and UTF-8.
  *
  * <p>For each encoding a fresh reader is stored in {@code hlr}, its {@code encoding} is set, and
  * {@code test_headerline_cases} is invoked first with {@code commonCases} and then with the
  * encoding-specific {@code cases}. {@code hlr}, {@code commonCases}, {@code cases} and
  * {@code test_headerline_cases} are declared outside this method.
  *
  * <p>Each case row pairs input bytes with an {@code Object[][]} holding one five-element entry
  * per line: a {@code HeaderLine} type constant, a string, two {@code null}s, and an int built
  * from {@code HeaderLineReader.E_BIT_*} flags. Every entry uses {@code HeaderLine.HLT_LINE}.
  * Presumably the entries are the expected type, line text, name, value and error bit mask -
  * confirm against {@code test_headerline_cases}.
  */
 @Test
 public void test_linereader_headerlines() {
   byte[] utf8Str;
   byte[] partialUtf8Str = null;
   try {
     // Build a UTF-8 byte sequence whose last byte is missing, so the final multi-byte
     // character (U+1234) is truncated.
     utf8Str = "WARCæøå\u1234".getBytes("UTF-8");
     partialUtf8Str = new byte[utf8Str.length - 1];
     System.arraycopy(utf8Str, 0, partialUtf8Str, 0, utf8Str.length - 1);
   } catch (UnsupportedEncodingException e) {
     e.printStackTrace();
     Assert.fail("Unexpected exception!");
   }
   try {
     // Cases run unchanged under every encoding: single line without terminator, multiple
     // CRLF-separated lines, a CRLF-terminated line, and lines with a stray CR / LF-only ending.
     commonCases =
         new Object[][] {
           {
             "content-type: monkeys".getBytes(),
             new Object[][] {
               {
                 HeaderLine.HLT_LINE,
                 "content-type: monkeys",
                 null,
                 null,
                 HeaderLineReader.E_BIT_EOF
               }
             }
           },
           {
             "content-type: monkeys\r\n and\r\n poo".getBytes(),
             new Object[][] {
               {HeaderLine.HLT_LINE, "content-type: monkeys", null, null, 0},
               {HeaderLine.HLT_LINE, " and", null, null, 0},
               {HeaderLine.HLT_LINE, " poo", null, null, HeaderLineReader.E_BIT_EOF}
             }
           },
           {
             "content-type: monkeys\r\n".getBytes(),
             new Object[][] {{HeaderLine.HLT_LINE, "content-type: monkeys", null, null, 0}}
           },
           {
             "content-type\r: monkeys\r\n and\r\n poo\n".getBytes(),
             new Object[][] {
               {
                 HeaderLine.HLT_LINE,
                 "content-type: monkeys",
                 null,
                 null,
                 HeaderLineReader.E_BIT_MISPLACED_CR
               },
               {HeaderLine.HLT_LINE, " and", null, null, 0},
               {HeaderLine.HLT_LINE, " poo", null, null, HeaderLineReader.E_BIT_MISSING_CR}
             }
           },
         };
     /*
      * Raw.
      */
     // NOTE(review): the strings paired with the UTF-8 encoded input and with partialUtf8Str in
     // this section (and in the ISO8859-1 section below) contain characters above U+00FF. They
     // look like they went through an encoding conversion at some point - verify them against
     // the upstream test source before relying on them.
     cases =
         new Object[][] {
           {
             "content-type: monkeys\u0001\r\n".getBytes(),
             new Object[][] {{HeaderLine.HLT_LINE, "content-type: monkeys\u0001", null, null, 0}}
           },
           {
             "content-type: monkeysæøå\u1234\r\n".getBytes("ISO8859-1"),
             new Object[][] {{HeaderLine.HLT_LINE, "content-type: monkeysæøå?", null, null, 0}}
           },
           {
             "content-type: monkeysæøå\u1234\r\n".getBytes("UTF-8"),
             new Object[][] {
               {HeaderLine.HLT_LINE, "content-type: monkeysæøåሴ", null, null, 0}
             }
           },
           {
             partialUtf8Str,
             new Object[][] {
               {HeaderLine.HLT_LINE, "WARCæøåáˆ", null, null, HeaderLineReader.E_BIT_EOF}
             }
           }
         };
     hlr = HeaderLineReader.getLineReader();
     hlr.encoding = HeaderLineReader.ENC_RAW;
     test_headerline_cases(commonCases);
     test_headerline_cases(cases);
     /*
      * US-ASCII.
      */
     cases =
         new Object[][] {
           {
             "content-type: monkeys\u0001\r\n".getBytes(),
             new Object[][] {
               {
                 HeaderLine.HLT_LINE,
                 "content-type: monkeys",
                 null,
                 null,
                 HeaderLineReader.E_BIT_INVALID_CONTROL_CHAR
               }
             }
           },
           {
             "content-type: monkeysæøå\u1234\r\n".getBytes("ISO8859-1"),
             new Object[][] {
               {
                 HeaderLine.HLT_LINE,
                 "content-type: monkeys?",
                 null,
                 null,
                 HeaderLineReader.E_BIT_INVALID_US_ASCII_CHAR
               }
             }
           },
           {
             "content-type: monkeysæøå\u1234\r\n".getBytes("UTF-8"),
             new Object[][] {
               {
                 HeaderLine.HLT_LINE,
                 "content-type: monkeys",
                 null,
                 null,
                 HeaderLineReader.E_BIT_INVALID_US_ASCII_CHAR
               }
             }
           },
           {
             partialUtf8Str,
             new Object[][] {
               {
                 HeaderLine.HLT_LINE,
                 "WARC",
                 null,
                 null,
                 HeaderLineReader.E_BIT_INVALID_US_ASCII_CHAR | HeaderLineReader.E_BIT_EOF
               }
             }
           }
         };
     hlr = HeaderLineReader.getLineReader();
     hlr.encoding = HeaderLineReader.ENC_US_ASCII;
     test_headerline_cases(commonCases);
     test_headerline_cases(cases);
     /*
      * ISO8859-1.
      */
     cases =
         new Object[][] {
           {
             "content-type: monkeys\u0001\r\n".getBytes(),
             new Object[][] {
               {
                 HeaderLine.HLT_LINE,
                 "content-type: monkeys",
                 null,
                 null,
                 HeaderLineReader.E_BIT_INVALID_CONTROL_CHAR
               }
             }
           },
           {
             "content-type: monkeysæøå\u1234\r\n".getBytes("ISO8859-1"),
             new Object[][] {{HeaderLine.HLT_LINE, "content-type: monkeysæøå?", null, null, 0}}
           },
           {
             "content-type: monkeysæøå\u1234\r\n".getBytes("UTF-8"),
             new Object[][] {
               {HeaderLine.HLT_LINE, "content-type: monkeysæøåሴ", null, null, 0}
             }
           },
           {
             partialUtf8Str,
             new Object[][] {
               {HeaderLine.HLT_LINE, "WARCæøåáˆ", null, null, HeaderLineReader.E_BIT_EOF}
             }
           }
         };
     hlr = HeaderLineReader.getLineReader();
     hlr.encoding = HeaderLineReader.ENC_ISO8859_1;
     test_headerline_cases(commonCases);
     test_headerline_cases(cases);
     /*
      * UTF-8.
      */
     cases =
         new Object[][] {
           {
             "content-type: monkeys\u0001\r\n".getBytes(),
             new Object[][] {
               {
                 HeaderLine.HLT_LINE,
                 "content-type: monkeys",
                 null,
                 null,
                 HeaderLineReader.E_BIT_INVALID_CONTROL_CHAR
               }
             }
           },
           // NOTE(review): this input ends in CRLF yet E_BIT_MISSING_CR is listed; presumably
           // the invalid trailing byte sequence consumes the CR - confirm against the reader.
           {
             "content-type: monkeysæøå\r\n".getBytes("ISO8859-1"),
             new Object[][] {
               {
                 HeaderLine.HLT_LINE,
                 "content-type: monkeys",
                 null,
                 null,
                 HeaderLineReader.E_BIT_INVALID_UTF8_ENCODING | HeaderLineReader.E_BIT_MISSING_CR
               }
             }
           },
           {
             "content-type: monkeysæøå\u1234\r\n".getBytes("ISO8859-1"),
             new Object[][] {
               {
                 HeaderLine.HLT_LINE,
                 "content-type: monkeys",
                 null,
                 null,
                 HeaderLineReader.E_BIT_INVALID_UTF8_ENCODING
               }
             }
           },
           {
             "content-type: monkeysæøå\u1234\r\n".getBytes("UTF-8"),
             new Object[][] {
               {HeaderLine.HLT_LINE, "content-type: monkeysæøå\u1234", null, null, 0}
             }
           },
           {
             partialUtf8Str,
             new Object[][] {
               {HeaderLine.HLT_LINE, "WARCæøå", null, null, HeaderLineReader.E_BIT_EOF}
             }
           }
         };
     hlr = HeaderLineReader.getLineReader();
     hlr.encoding = HeaderLineReader.ENC_UTF8;
     test_headerline_cases(commonCases);
     test_headerline_cases(cases);
   } catch (IOException e) {
     // Any I/O failure while building or running the cases fails the test.
     e.printStackTrace();
     Assert.fail("Unexpected exception!");
   }
 }
Exemple #5
0
 /**
  * Creates a {@code HeaderLineReader} preconfigured for normal lines, i.e. lines that carry
  * neither LWS nor key:value headers. US-ASCII is the expected character encoding.
  *
  * @return a new reader set up to read normal lines
  */
 public static HeaderLineReader getLineReader() {
   final HeaderLineReader lineReader = new HeaderLineReader();
   // Plain lines only: US-ASCII input, no key:value splitting.
   lineReader.encoding = ENC_US_ASCII;
   lineReader.bNameValue = false;
   return lineReader;
 }