Logo Search packages:      
Sourcecode: tcs version File versions  Download package

conv_jis.c

#ifdef      PLAN9
#include    <u.h>
#include    <libc.h>
#include    <bio.h>
#else
#include    <stdio.h>
#include    <unistd.h>
#include    "plan9.h"
#endif
#include    "hdr.h"
#include    "conv.h"
#include    "kuten208.h"
#include    "jis.h"

/*
      a state machine for interpreting all sorts of encodings.

      Handles the escape sequences ESC $ @ / ESC $ B (enter two-byte
      mode) and ESC ( J / ESC ( H / ESC ( B (leave it), byte pairs with
      the high bit set on both bytes, and pairs accepted by CANS2J
      (shift-jis).

      c         next input byte, or a negative value at end of input
      r         output cursor handed to emit()
      input_loc byte offset of c, used only in diagnostics

      All decoder state lives in function statics, so it persists from
      one call to the next.
*/
static void
alljis(int c, Rune **r, long input_loc)
{
      static enum { state0, state1, state2, state3, state4 } state = state0;
      static int set8 = 0;          /* nonzero once ESC $ @ / ESC $ B was seen */
      static int japan646 = 0;      /* nonzero after ESC ( J: map \ and ~ */
      static int lastc;             /* first byte of a pending pair */
      int n;
      long l;

again:
      switch(state)
      {
      case state0:      /* idle state */
            if(c == ESC){ state = state1; return; }
            if(c < 0) return;
            if(!set8 && (c < 128)){
                  if(japan646){
                        switch(c)
                        {
                        case '\\':  emit(0xA5); return;     /* yen */
                        case '~':   emit(0xAF); return;     /* spacing macron */
                        default:    emit(c); return;
                        }
                  } else {
                        emit(c);
                        return;
                  }
            }
            if(c < 0x21){     /* guard against bogus characters in JIS mode */
                  if(squawk)      /* input_loc is a long: print it with %ld */
                        EPR "%s: non-JIS character %02x in %s near byte %ld\n", argv0, c, file, input_loc);
                  emit(c);
                  return;
            }
            lastc = c; state = state4; return;

      case state1:      /* seen an escape */
            if(c == '$'){ state = state2; return; }
            if(c == '('){ state = state3; return; }
            /* not a sequence we know: pass the ESC through and reprocess c */
            emit(ESC); state = state0; goto again;

      case state2:      /* may be shifting into JIS */
            if((c == '@') || (c == 'B')){
                  set8 = 1; state = state0; return;
            }
            emit(ESC); emit('$'); state = state0; goto again;

      case state3:      /* may be shifting out of JIS */
            if((c == 'J') || (c == 'H') || (c == 'B')){
                  japan646 = (c == 'J');
                  set8 = 0; state = state0; return;
            }
            emit(ESC); emit('('); state = state0; goto again;

      case state4:      /* two part char */
            if(c < 0){
                  if(squawk)
                        EPR "%s: unexpected EOF in %s\n", argv0, file);
                  /* synthesize a second byte with the same high bit as lastc */
                  c = 0x21 | (lastc&0x80);
            }
            if((lastc&0x80) != (c&0x80)){ /* guard against latin1 in jis */
                  emit(lastc);
                  state = state0;
                  goto again;
            }
            if(CANS2J(lastc, c)){   /* ms dos sjis */
                  int hi = lastc, lo = c;
                  S2J(hi, lo);                  /* convert to 208 */
                  n = hi*100 + lo - 3232;       /* convert to kuten208 */
            } else
                  n = (lastc&0x7F)*100 + (c&0x7f) - 3232;   /* kuten208 */
            /*
             * n goes negative for pairs below row 32 (e.g. 0x80,0x80);
             * reject those before they are used as a table index.
             */
            if((n < 0) || (n >= KUTEN208MAX) || ((l = tabkuten208[n]) == -1)){
                  nerrors++;
                  if(squawk)
                        EPR "%s: unknown kuten208 %d (from 0x%x,0x%x) near byte %ld in %s\n", argv0, n, lastc, c, input_loc, file);
                  if(!clean)
                        emit(BADMAP);
            } else {
                  if(l < 0){  /* negative table entries mark ambiguous mappings */
                        l = -l;
                        if(squawk)
                              EPR "%s: ambiguous kuten208 %d (mapped to 0x%lx) near byte %ld in %s\n", argv0, n, l, input_loc, file);
                  }
                  emit(l);
            }
            state = state0;
      }
}

/*
      a state machine for interpreting ms-kanji == shift-jis.

      Same escape handling as the other JIS decoders in this file, but
      a byte pair is only accepted when CANS2J says it is a shift-jis
      pair; anything else is counted as an error.

      c         next input byte, or a negative value at end of input
      r         output cursor handed to emit()
      input_loc byte offset of c, used only in diagnostics

      Decoder state lives in function statics and persists across calls.
*/
static void
ms(int c, Rune **r, long input_loc)
{
      static enum { state0, state1, state2, state3, state4 } state = state0;
      static int set8 = 0;          /* nonzero once ESC $ @ / ESC $ B was seen */
      static int japan646 = 0;      /* nonzero after ESC ( J: map \ and ~ */
      static int lastc;             /* first byte of a pending pair */
      int eof = 0;                  /* set when c was synthesized at end of input */
      int n;
      long l;

again:
      switch(state)
      {
      case state0:      /* idle state */
            if(c == ESC){ state = state1; return; }
            if(c < 0) return;
            if(!set8 && (c < 128)){
                  if(japan646){
                        switch(c)
                        {
                        case '\\':  emit(0xA5); return;     /* yen */
                        case '~':   emit(0xAF); return;     /* spacing macron */
                        default:    emit(c); return;
                        }
                  } else {
                        emit(c);
                        return;
                  }
            }
            lastc = c; state = state4; return;

      case state1:      /* seen an escape */
            if(c == '$'){ state = state2; return; }
            if(c == '('){ state = state3; return; }
            /* not a sequence we know: pass the ESC through and reprocess c */
            emit(ESC); state = state0; goto again;

      case state2:      /* may be shifting into JIS */
            if((c == '@') || (c == 'B')){
                  set8 = 1; state = state0; return;
            }
            emit(ESC); emit('$'); state = state0; goto again;

      case state3:      /* may be shifting out of JIS */
            if((c == 'J') || (c == 'H') || (c == 'B')){
                  japan646 = (c == 'J');
                  set8 = 0; state = state0; return;
            }
            emit(ESC); emit('('); state = state0; goto again;

      case state4:      /* two part char */
            if(c < 0){
                  eof = 1;
                  if(squawk)
                        EPR "%s: unexpected EOF in %s\n", argv0, file);
                  /* synthesize a second byte with the same high bit as lastc */
                  c = 0x21 | (lastc&0x80);
            }
            if((lastc&0x80) != (c&0x80)){ /* guard against latin1 in jis */
                  emit(lastc);
                  state = state0;
                  goto again;
            }
            if(CANS2J(lastc, c)){   /* ms dos sjis */
                  int hi = lastc, lo = c;
                  S2J(hi, lo);                  /* convert to 208 */
                  n = hi*100 + lo - 3232;       /* convert to kuten208 */
            } else {
                  nerrors++;
                  if(squawk)
                        EPR "%s: illegal byte pair (0x%x,0x%x) near byte %ld in %s\n", argv0, lastc, c, input_loc, file);
                  if(!clean)
                        emit(BADMAP);
                  state = state0;
                  /*
                   * At end of input c is only a filler byte; feeding it
                   * back through state0 would emit or buffer a character
                   * that was never in the input.
                   */
                  if(eof)
                        return;
                  goto again; /* c may start a new character: reprocess it */
            }
            /* never index the table with a negative kuten number */
            if((n < 0) || (n >= KUTEN208MAX) || ((l = tabkuten208[n]) == -1)){
                  nerrors++;
                  if(squawk)
                        EPR "%s: unknown kuten208 %d (from 0x%x,0x%x) near byte %ld in %s\n", argv0, n, lastc, c, input_loc, file);
                  if(!clean)
                        emit(BADMAP);
            } else {
                  if(l < 0){  /* negative table entries mark ambiguous mappings */
                        l = -l;
                        if(squawk)
                              EPR "%s: ambiguous kuten208 %d (mapped to 0x%lx) near byte %ld in %s\n", argv0, n, l, input_loc, file);
                  }
                  emit(l);
            }
            state = state0;
      }
}

/*
      a state machine for interpreting ujis == EUC

      Bytes below 128 pass straight through.  0x8e and 0x8f (codesets
      2 and 3) are reported as errors.  Any other byte >= 128 is held
      as the first half of a pair and combined with the next byte into
      a kuten208 index.

      c         next input byte, or a negative value at end of input
      r         output cursor handed to emit()
      input_loc byte offset of c, used only in diagnostics

      Decoder state lives in function statics and persists across calls.
*/
static void
ujis(int c, Rune **r, long input_loc)
{
      static enum { state0, state1 } state = state0;
      static int lastc;       /* first byte of a pending pair */
      int n;
      long l;

      switch(state)
      {
      case state0:      /* idle state */
            if(c < 0) return;
            if(c < 128){
                  emit(c);
                  return;
            }
            if(c == 0x8e){    /* codeset 2 */
                  nerrors++;
                  if(squawk)
                        EPR "%s: unknown codeset 2 near byte %ld in %s\n", argv0, input_loc, file);
                  if(!clean)
                        emit(BADMAP);
                  return;
            }
            if(c == 0x8f){    /* codeset 3 */
                  nerrors++;
                  if(squawk)
                        EPR "%s: unknown codeset 3 near byte %ld in %s\n", argv0, input_loc, file);
                  if(!clean)
                        emit(BADMAP);
                  return;
            }
            lastc = c;
            state = state1;
            return;

      case state1:      /* two part char */
            if(c < 0){
                  if(squawk)
                        EPR "%s: unexpected EOF in %s\n", argv0, file);
                  c = 0xA1;   /* filler second byte so the pair can be looked up */
            }
            n = (lastc&0x7F)*100 + (c&0x7F) - 3232;   /* kuten208 */
            /*
             * A lead byte such as 0x80, or a low second byte, makes n
             * negative; reject it before it is used as a table index.
             */
            if((n < 0) || (n >= KUTEN208MAX) || ((l = tabkuten208[n]) == -1)){
                  nerrors++;
                  if(squawk)
                        EPR "%s: unknown kuten208 %d (from 0x%x,0x%x) near byte %ld in %s\n", argv0, n, lastc, c, input_loc, file);
                  if(!clean)
                        emit(BADMAP);
            } else {
                  if(l < 0){  /* negative table entries mark ambiguous mappings */
                        l = -l;
                        if(squawk)
                              EPR "%s: ambiguous kuten208 %d (mapped to 0x%lx) near byte %ld in %s\n", argv0, n, l, input_loc, file);
                  }
                  emit(l);
            }
            state = state0;
      }
}

/*
      a state machine for interpreting jis-kanji == 2022-JP

      ESC $ @ and ESC $ B switch to two-byte mode; ESC ( J, ESC ( H and
      ESC ( B switch back (ESC ( J additionally maps \ and ~ to yen and
      macron).  In two-byte mode each pair is turned into a kuten208
      index and looked up in tabkuten208.

      c         next input byte, or a negative value at end of input
      r         output cursor handed to emit()
      input_loc byte offset of c, used only in diagnostics

      Decoder state lives in function statics and persists across calls.
*/
static void
jis(int c, Rune **r, long input_loc)
{
      static enum { state0, state1, state2, state3, state4 } state = state0;
      static int set8 = 0;          /* nonzero while in two-byte mode */
      static int japan646 = 0;      /* nonzero after ESC ( J */
      static int lastc;             /* first byte of a pending pair */
      int n;
      long l;

again:
      switch(state)
      {
      case state0:      /* idle state */
            if(c == ESC){ state = state1; return; }
            if(c < 0) return;
            if(!set8 && (c < 128)){
                  if(japan646){
                        switch(c)
                        {
                        case '\\':  emit(0xA5); return;     /* yen */
                        case '~':   emit(0xAF); return;     /* spacing macron */
                        default:    emit(c); return;
                        }
                  } else {
                        emit(c);
                        return;
                  }
            }
            lastc = c; state = state4; return;

      case state1:      /* seen an escape */
            if(c == '$'){ state = state2; return; }
            if(c == '('){ state = state3; return; }
            /* not a sequence we know: pass the ESC through and reprocess c */
            emit(ESC); state = state0; goto again;

      case state2:      /* may be shifting into JIS */
            if((c == '@') || (c == 'B')){
                  set8 = 1; state = state0; return;
            }
            emit(ESC); emit('$'); state = state0; goto again;

      case state3:      /* may be shifting out of JIS */
            if((c == 'J') || (c == 'H') || (c == 'B')){
                  japan646 = (c == 'J');
                  set8 = 0; state = state0; return;
            }
            emit(ESC); emit('('); state = state0; goto again;

      case state4:      /* two part char */
            if(c < 0){
                  if(squawk)
                        EPR "%s: unexpected EOF in %s\n", argv0, file);
                  /* synthesize a second byte with the same high bit as lastc */
                  c = 0x21 | (lastc&0x80);
            }
            if((lastc&0x80) != (c&0x80)){ /* guard against latin1 in jis */
                  emit(lastc);
                  state = state0;
                  goto again;
            }
            n = (lastc&0x7F)*100 + (c&0x7f) - 3232;   /* kuten208 */
            /*
             * Control bytes seen in two-byte mode (a newline, say) give
             * a negative n; reject it before it indexes the table.
             */
            if((n < 0) || (n >= KUTEN208MAX) || ((l = tabkuten208[n]) == -1)){
                  nerrors++;
                  if(squawk)
                        EPR "%s: unknown kuten208 %d (from 0x%x,0x%x) near byte %ld in %s\n", argv0, n, lastc, c, input_loc, file);
                  if(!clean)
                        emit(BADMAP);
            } else {
                  if(l < 0){  /* negative table entries mark ambiguous mappings */
                        l = -l;
                        if(squawk)
                              EPR "%s: ambiguous kuten208 %d (mapped to 0x%lx) near byte %ld in %s\n", argv0, n, l, input_loc, file);
                  }
                  emit(l);
            }
            state = state0;
      }
}

/*
      Read fd to exhaustion, pushing every byte through procfn and
      handing the runes it produces to the output converter.  procfn is
      called one last time with -1 so it can deal with a character left
      unfinished at end of input.
*/
static void
do_in(int fd, void (*procfn)(int, Rune **, long), struct convert *out)
{
      Rune ob[N];
      Rune *r, *re;
      uchar ibuf[N];
      int got, k;
      long offset;

      offset = 0;
      r = ob;
      re = ob+N-3;      /* flush threshold: keeps a few slots free for the next call */
      for(;;){
            got = read(fd, ibuf, sizeof ibuf);
            if(got <= 0)
                  break;
            k = 0;
            while(k < got){
                  (*procfn)(ibuf[k], &r, offset);
                  offset++;
                  k++;
                  if(r >= re){
                        OUT(out, ob, r-ob);
                        r = ob;
                  }
            }
            /* push out whatever this read produced before blocking again */
            if(r > ob){
                  OUT(out, ob, r-ob);
                  r = ob;
            }
      }
      /* tell the decoder the input has ended */
      (*procfn)(-1, &r, offset);
      if(r > ob){
            OUT(out, ob, r-ob);
      }
}

/*
      Input entry point that decodes fd with the catch-all alljis state
      machine and passes the resulting runes to out.  notused is ignored.
*/
void
jis_in(int fd, long *notused, struct convert *out)
{
      USED(notused);
      do_in(fd, alljis, out);
}

/*
      Input entry point that decodes fd with the ujis (EUC) state
      machine and passes the resulting runes to out.  notused is ignored.
*/
void
ujis_in(int fd, long *notused, struct convert *out)
{
      USED(notused);
      do_in(fd, ujis, out);
}

/*
      Input entry point that decodes fd with the ms (shift-jis) state
      machine and passes the resulting runes to out.  notused is ignored.
*/
void
msjis_in(int fd, long *notused, struct convert *out)
{
      USED(notused);
      do_in(fd, ms, out);
}

/*
      Input entry point that decodes fd with the jis (2022-JP) state
      machine and passes the resulting runes to out.  notused is ignored.
*/
void
jisjis_in(int fd, long *notused, struct convert *out)
{
      USED(notused);
      do_in(fd, jis, out);
}

/* nonzero until tab_init has built the reverse table */
static int first = 1;

/*
      Build tab[], the rune -> kuten208 reverse mapping used by the
      output converters.  Every slot starts out as -1 (no mapping);
      tabkuten208 entries equal to -1 are skipped and negative entries
      are entered under their absolute value.
*/
static void
tab_init(void)
{
      int k;
      long rune;

      for(k = 0; k < NRUNE; k++)
            tab[k] = -1;
      for(k = 0; k < KUTEN208MAX; k++){
            rune = tabkuten208[k];
            if(rune == -1)
                  continue;
            if(rune < 0)
                  rune = -rune;
            tab[rune] = k;
      }
      first = 0;
}


/*
      jis-kanji, or ISO 2022-JP

      Encode the n runes at base into obuf and write the result to file
      descriptor 1.  Runes below 128 are written as single bytes; runes
      found in tab[] are written as two bytes after ESC $ B.  The
      current shift state is kept in a function static, so it carries
      over from one call to the next.  Runes with no mapping are
      counted in nerrors and, unless clean is set, replaced by
      BYTEBADMAP.  notused is ignored.
*/
void
jisjis_out(Rune *base, int n, long *notused)
{
      char *p;
      int i;
      Rune r;
      static enum { ascii, japan646, jp2022 } state = ascii;

      USED(notused);
      if(first)
            tab_init();
      nrunes += n;
      p = obuf;
      for(i = 0; i < n; i++){
            r = base[i];
            if(r < 128){
                  if(state == jp2022){
                        *p++ = ESC; *p++ = '('; *p++ = 'H';
                        state = ascii;
                  }
                  *p++ = r;
                  continue;
            }
            if(tab[r] != -1){
                  if(state != jp2022){
                        *p++ = ESC; *p++ = '$'; *p++ = 'B';
                        state = jp2022;
                  }
                  /* kuten row and column, each offset by 0x20 */
                  *p++ = tab[r]/100 + ' ';
                  *p++ = tab[r]%100 + ' ';
                  continue;
            }
            if(squawk)
                  EPR "%s: rune 0x%x not in output cs\n", argv0, r);
            nerrors++;
            if(clean)
                  continue;
            /*
             * The placeholder is a single byte.  Written while still in
             * two-byte mode it would be paired with the next byte by a
             * decoder, so leave two-byte mode first, exactly as is done
             * for runes below 128.
             */
            if(state == jp2022){
                  *p++ = ESC; *p++ = '('; *p++ = 'H';
                  state = ascii;
            }
            *p++ = BYTEBADMAP;
      }
      noutput += p-obuf;
      if(p > obuf)
            write(1, obuf, p-obuf);
}

/*
      ms-kanji, or Shift-JIS

      Encode the n runes at base into obuf and write the result to file
      descriptor 1.  Runes below 128 are copied as single bytes; runes
      found in tab[] become a JIS byte pair that J2S then rewrites.
      Runes with no mapping are counted in nerrors and, unless clean is
      set, replaced by BYTEBADMAP.  notused is ignored.
*/
void
msjis_out(Rune *base, int n, long *notused)
{
      char *dst;
      int idx, hi, lo;
      Rune cur;

      USED(notused);
      if(first)
            tab_init();
      nrunes += n;
      dst = obuf;
      for(idx = 0; idx < n; idx++){
            cur = base[idx];
            if(cur < 128){
                  *dst++ = cur;
                  continue;
            }
            if(tab[cur] == -1){
                  /* no kuten208 entry for this rune */
                  if(squawk)
                        EPR "%s: rune 0x%x not in output cs\n", argv0, cur);
                  nerrors++;
                  if(!clean)
                        *dst++ = BYTEBADMAP;
                  continue;
            }
            /* kuten row and column, each offset by 0x20, then to shift-jis */
            hi = tab[cur]/100 + ' ';
            lo = tab[cur]%100 + ' ';
            J2S(hi, lo);
            *dst++ = hi;
            *dst++ = lo;
      }
      noutput += dst-obuf;
      if(dst > obuf)
            write(1, obuf, dst-obuf);
}

/*
      ujis, or EUC

      Encode the n runes at base into obuf and write the result to file
      descriptor 1.  Runes below 128 are copied as single bytes; runes
      found in tab[] become two bytes with the high bit set.  Runes
      with no mapping are counted in nerrors and, unless clean is set,
      replaced by BYTEBADMAP.  notused is ignored.
*/
void
ujis_out(Rune *base, int n, long *notused)
{
      char *dst;
      int idx;
      Rune cur;

      USED(notused);
      if(first)
            tab_init();
      nrunes += n;
      dst = obuf;
      for(idx = 0; idx < n; idx++){
            cur = base[idx];
            if(cur < 128){
                  *dst++ = cur;
                  continue;
            }
            if(tab[cur] == -1){
                  /* no kuten208 entry for this rune */
                  if(squawk)
                        EPR "%s: rune 0x%x not in output cs\n", argv0, cur);
                  nerrors++;
                  if(!clean)
                        *dst++ = BYTEBADMAP;
                  continue;
            }
            /* kuten row and column, each offset by 0x20, high bit forced on */
            *dst++ = 0x80 | (tab[cur]/100 + ' ');
            *dst++ = 0x80 | (tab[cur]%100 + ' ');
      }
      noutput += dst-obuf;
      if(dst > obuf)
            write(1, obuf, dst-obuf);
}

Generated by  Doxygen 1.6.0   Back to index