コード例 #1
0
ファイル: TyPun.java プロジェクト: rbazaud/TyPun
  /**
   * Logs the typographic options in effect and sets up the quote and dash glyphs.
   *
   * <p>Every option is read from {@code tc} and reported through {@code lg.config} as YES or NO.
   * Afterwards {@code ldquo}, {@code rdquo} and {@code tydash} are assigned.
   */
  private void initParam() {
    final String className = TyPun.class.getName();
    lg.entering(className, "initParam");

    /* record the value of every option that is going to be used */
    final boolean punctuation = tc.do_UsePunctuation();
    lg.config(String.format("Use typographic punctuation : %s", punctuation ? "YES" : "NO"));

    final boolean ligatures = tc.do_UseLigatures();
    lg.config(String.format("Use ligatures : %s", ligatures ? "YES" : "NO"));

    final boolean oldLigatures = tc.do_UseOldLigatures();
    lg.config(String.format("Use old ligatures : %s", oldLigatures ? "YES" : "NO"));

    final boolean frenchQuotes = tc.do_UseFrenchQuotes();
    lg.config(String.format("Use French quotes : %s", frenchQuotes ? "YES" : "NO"));

    final boolean oldStyleNums = tc.do_UseOldStyleNums();
    lg.config(String.format("Use old style numbers : %s", oldStyleNums ? "YES" : "NO"));

    final boolean frenchSpacing = tc.do_UseFrenchSpacing();
    lg.config(String.format("Use French spacing : %s", frenchSpacing ? "YES" : "NO"));

    final boolean typoDash = tc.do_UseTypoDash();
    lg.config(String.format("Use typographic dashes : %s", typoDash ? "YES" : "NO"));

    /* The double quotes are chosen at run time so that the very same code
     * produces either French guillemets (U+00AB / U+00BB) or English
     * curly quotes (U+201C / U+201D). */
    if (tc.do_UseFrenchQuotes()) {
      ldquo = "\u00ab";
    } else {
      ldquo = "\u201c";
    }

    if (tc.do_UseFrenchQuotes()) {
      rdquo = "\u00bb";
    } else {
      rdquo = "\u201d";
    }

    /* en dash (U+2013); the em dash (U+2014) would be the alternative */
    tydash = "\u2013";

    lg.exiting(className, "initParam");
  }
コード例 #2
0
ファイル: TyPun.java プロジェクト: rbazaud/TyPun
  /**
   * Applies the enabled typographic substitutions to a single line of text.
   *
   * <p>The line is scanned left to right, one {@code char} at a time. While {@code bIsOpenTag} is
   * set (a {@code <} has been seen and no {@code >} yet) chars are copied untouched, so that tag
   * names, attribute values or hexadecimal numbers are never rewritten. Outside of tags, and
   * depending on the options reported by {@code tc}:
   *
   * <ul>
   *   <li>punctuation: the straight apostrophe, the backquote and the double quote become curly
   *       (or French) quotes, two or three consecutive dots become an ellipsis;
   *   <li>French spacing: a thin space (U+2009) is inserted before {@code ;}, {@code ?} and
   *       {@code !} unless a space (regular, thin, no-break or narrow no-break) is already
   *       there, or the {@code !} directly follows a {@code ?};
   *   <li>ligatures: ff, fi, fl, ffi and ffl are replaced by their ligature;
   *   <li>old ligatures: ch, ck, ct and Th are replaced by private-use code points;
   *   <li>old style numbers: ASCII digits are replaced by private-use code points;
   *   <li>typographic dashes: {@code --}, a dash starting or ending the line, and a dash
   *       followed by a space or a tag are replaced by {@code tydash}.
   * </ul>
   *
   * <p>The method reads and updates {@code bIsOpenTag}, {@code bIsOpenQuote} and
   * {@code baePresent}, all declared outside of it, so an unterminated tag or quote carries over
   * to the next call.
   *
   * @param line the line to process, must not be {@code null}
   * @return a new string holding the processed line
   */
  private String processLineSeq(String line) {
    final int len = line.length();

    /* The capacity of a StringBuilder is counted in chars, not in bytes:
     * a ligature or an ellipsis replaces two or three chars by a single
     * one, so only the thin spaces inserted by French spacing can make the
     * result longer than the input. The margin of 7 covers the common case
     * without any reallocation, the builder grows by itself beyond that. */
    final StringBuilder pline = new StringBuilder(len + 7);

    for (int index = 0; index < len; index++) {

      final char c = line.charAt(index);

      if (bIsOpenTag) {
        /* inside a tag nothing is changed, otherwise we might end up with
         * an unreadable hexadecimal number (ff ligature), an unknown tag
         * name, ...
         * TBD: malformed tags such as <xxx <yyy> zzz> are not detected */
        pline.append(c);
        if (c == '>') {
          bIsOpenTag = false;
        }
        continue;
      }

      switch (c) {

        /* NB: the ij and IJ ligatures (U+0133, U+0132) are deliberately
         * not produced: with some fonts, e.g. New Century Schoolbook, their
         * kerning is really bad (see "ouija" or "mija"). */

        case 'a':
        case 'A':
          /* Only remember that an ae / AE / Ae / aE sequence has been seen.
           * The text itself is never changed since we cannot know whether
           * the ligature is legitimate, it is up to the user to decide.
           * NB: the same is not done for oe which is far too common
           * ("does", "doesn't"). */
          if (tc.do_UseLigatures()
              && index + 1 < len
              && !baePresent
              && Character.toLowerCase(line.charAt(index + 1)) == 'e') {
            baePresent = true;
          }
          pline.append(c);
          break;

        case '\'':
          /* straight apostrophe -> right single quotation mark (U+2019) */
          pline.append(tc.do_UsePunctuation() ? '\u2019' : c);
          break;

        case '`': /* TBD: replace `` by a left and '' by a right double quote */
          /* backquote -> left single quotation mark (U+2018) */
          pline.append(tc.do_UsePunctuation() ? '\u2018' : c);
          break;

        case '.':
          if (tc.do_UsePunctuation() && index + 1 < len && line.charAt(index + 1) == '.') {
            /* three dots make an ellipsis (U+2026); with only two dots we
             * assume the user wanted a (missing) third one */
            pline.append('\u2026');
            index += (index + 2 < len && line.charAt(index + 2) == '.') ? 2 : 1;
          } else {
            pline.append(c);
          }
          break;

        case ';':
        case '?':
          /* The colon is deliberately left out: the extra space looks bad
           * after a name (HI subs) or between hours and minutes. */
          if (tc.do_UseFrenchSpacing()
              && index > 0
              && !isTypographicSpace(line.charAt(index - 1))) {
            pline.append('\u2009'); /* thin space */
          }
          /* no change to the character itself */
          pline.append(c);
          break;

        case '!':
          if (tc.do_UseFrenchSpacing() && index > 0) {
            final char prevc = line.charAt(index - 1);
            /* "?!" is kept together, no space in between */
            if (!isTypographicSpace(prevc) && prevc != '?') {
              pline.append('\u2009'); /* thin space */
            }
          }
          /* no change to the character itself */
          pline.append(c);
          break;

        case 'f':
          if (tc.do_UseLigatures()) {
            index += appendFLigature(line, index, pline);
          } else {
            pline.append(c);
          }
          break;

        case 'c':
          if (tc.do_UseOldLigatures() && index + 1 < len) {

            /* the old ligatures are private-use code points */
            switch (line.charAt(index + 1)) {
              case 'h':
                pline.append('\ue085');
                ++index;
                break;

              case 'k':
                pline.append('\ue086');
                ++index;
                break;

              case 't':
                pline.append('\ue087');
                ++index;
                break;

              default:
                pline.append(c);
            }

          } else {
            pline.append(c);
          }
          break;

        case 'T':
          if (tc.do_UseOldLigatures() && index + 1 < len && line.charAt(index + 1) == 'h') {
            pline.append('\ue062'); /* Th, private-use code point */
            ++index;
          } else {
            pline.append(c);
          }
          break;

        case '"':
          /* A quote that starts the line is always a left quote: users
           * often write several left quotes and a single right one for a
           * long quotation running over several lines. */
          if (tc.do_UsePunctuation()) {
            pline.append((index == 0 || !bIsOpenQuote) ? ldquo : rdquo);
          } else {
            pline.append(c);
          }
          /* the open/close state is tracked even when quotes are left as is */
          bIsOpenQuote = index == 0 || !bIsOpenQuote;
          break;

        case '<':
          bIsOpenTag = true;
          pline.append(c);
          break;

        case '-': /* TBD: replace '--' by en dash U+2013 and '---' by em dash U+2014 */
          if (!tc.do_UseTypoDash()) {
            pline.append(c);
          } else if (index + 1 >= len) {
            pline.append(tydash); /* dash ending the line */
          } else {

            switch (line.charAt(index + 1)) {
              case '-':
                /* two dashes are merged into a single typographic dash */
                pline.append(tydash);
                ++index;
                break;

              case '<': /* HTML tag */
              case '\u2009': /* thin space */
              case ' ':
                pline.append(tydash);
                break;

              default:
                /* between words: always a simple dash unless it is the
                 * first char of the line */
                if (index == 0) {
                  pline.append(tydash);
                } else {
                  pline.append(c);
                }
            }
          }
          break;

        case '0':
        case '1':
        case '2':
        case '3':
        case '4':
        case '5':
        case '6':
        case '7':
        case '8':
        case '9':
          /* old style digits are the private-use code points U+F730 to
           * U+F739, laid out in the same order as 0 to 9 */
          pline.append(tc.do_UseOldStyleNums() ? (char) ('\uf730' + (c - '0')) : c);
          break;

        default:
          pline.append(c);
      }
    }

    return pline.toString();
  }

  /**
   * Tells whether {@code ch} is a space that already separates a word from the punctuation mark
   * following it: regular space, thin space (U+2009), no-break space (U+00A0) or narrow no-break
   * space (U+202F).
   */
  private boolean isTypographicSpace(char ch) {
    return ch == ' ' || ch == '\u2009' || ch == '\u00a0' || ch == '\u202f';
  }

  /**
   * Appends to {@code out} the longest f-ligature starting at {@code index} in {@code line}, or
   * a plain f when there is none, and returns how many chars after the f were consumed.
   *
   * <p>The three-letter ligatures (ffi, ffl) are always tried before the two-letter ones. The ft
   * and long-s-t ligatures are deliberately not produced.
   */
  private int appendFLigature(String line, int index, StringBuilder out) {
    final int len = line.length();

    if (index + 1 >= len) {
      out.append('f'); /* the f is the last char of the line */
      return 0;
    }

    switch (line.charAt(index + 1)) {
      case 'f':
        if (index + 2 < len) {
          final char third = line.charAt(index + 2);
          if (third == 'i') {
            out.append('\ufb03'); /* ffi */
            return 2;
          }
          if (third == 'l') {
            out.append('\ufb04'); /* ffl */
            return 2;
          }
        }
        out.append('\ufb00'); /* ff */
        return 1;

      case 'i':
        out.append('\ufb01'); /* fi */
        return 1;

      case 'l':
        out.append('\ufb02'); /* fl */
        return 1;

      default:
        out.append('f');
        return 0;
    }
  }
コード例 #3
0
ファイル: TyPun.java プロジェクト: rbazaud/TyPun
  /**
   * Writes every line of {@code lst} to the standard output, applying the typographic
   * substitutions of {@code processLineSeq} to the lines that are actual text.
   *
   * <p>The first line is printed as is (it is either empty or an index number) once the BOM
   * returned by {@code tc.get_BOM()} has been stripped from it, if present. Among the following
   * lines, those made only of digits (index numbers), the empty ones and the timecodes
   * (containing {@code " --> "}) are printed unchanged.
   *
   * <p>Note that the first element is removed from {@code lst}. Runtime exceptions are logged,
   * not propagated.
   *
   * @param lst the lines to process, its first element gets removed
   */
  private void processList(final List<String> lst) {
    lg.entering(getClass().getName(), "processList");

    try {

      if (lst.isEmpty()) {

        /* nothing to print; without this test remove (0) would throw */
        lg.warning("Nothing to process: the list of lines is empty");

      } else {

        String firstline = lst.remove(0);

        final String bom = tc.get_BOM();
        if (firstline.startsWith(bom)) {

          lg.info("First line starts with a BOM (Byte Order Mark)");
          /* We don't keep it since a BOM is not really needed with UTF-8
           * encoding. Strip exactly what has been matched rather than
           * assuming the BOM is one char long. */
          firstline = firstline.substring(bom.length());
        }

        /* the first line is either empty or an index number */
        System.out.println(firstline);

        /* Zero or more digits: a single test catches both the index
         * numbers and the empty lines. Compiled once, outside of the loop. */
        final java.util.regex.Pattern indexOrEmpty = java.util.regex.Pattern.compile("\\d*");

        for (final String s : lst) {

          final boolean isIndexOrEmpty = indexOrEmpty.matcher(s).matches();
          final boolean isTimecode = s.contains(" --> ");

          /* write the line to stdout, processed only when it is text */
          System.out.println(isIndexOrEmpty || isTimecode ? s : processLineSeq(s));
        }
      }

    } catch (RuntimeException e) {

      /* NOTE(review): assumes lg is a java.util.logging.Logger - confirm.
       * The exception itself is logged so that its type and stack trace are
       * kept (its message alone may be null). */
      lg.log(java.util.logging.Level.SEVERE, "Unable to process the list of lines", e);
    }

    lg.exiting(getClass().getName(), "processList");
  }