28日目: 正規表現に対応する

string, xstring, symbolに取り組んできたので、今回は正規表現リテラルに取り組みます。

parserを変更する

xstring, symbolなどと同じように正規表現もまた必要に応じてNODE_DSTRをつくり、最後にNODE_REGXやNODE_DREGXに変換するという実装になっています。

regexp         : tREGEXP_BEG regexp_contents tREGEXP_END
                   {
                       $$ = new_regexp(p, $2, $3, &@$, &@1, &@2, &@3);
                   /*% ripper: regexp_literal!($:2, $:3) %*/
                   }
               ;

regexp_contents: /* none */
                    {
                        $$ = 0;
                    /*% ripper: regexp_new! %*/
                    }
                | regexp_contents string_content
                    {
                        NODE *head = $1, *tail = $2;
                        if (!head) {
                            $$ = tail;
                        }
                        else if (!tail) {
                            $$ = head;
                        }
                        else {
                            switch (nd_type(head)) {
                              case NODE_STR:
                                head = str2dstr(p, head);
                                break;
                              case NODE_DSTR:
                                break;
                              default:
                                head = list_append(p, NEW_DSTR(0, &@$), head);
                                break;
                            }
                            $$ = list_append(p, head, tail);
                        }
                    /*% ripper: regexp_add!($:1, $:2) %*/
                    }
                ;

ノードの書き換え後はそれぞれRegularExpressionNodeとInterpolatedRegularExpressionNodeになりますが構造は変わりません。 regexp_contentsではliteral_concat関数を呼び出すようにし、new_regexp関数も書き換え後のノードに対応するよう書き換えます。

regexp          : tREGEXP_BEG regexp_contents tREGEXP_END
                    {
                        $$ = new_regexp(p, $2, $3, &@$, &@1, &@2, &@3);
                    /*% ripper: regexp_literal!($:2, $:3) %*/
                    }
                ;


regexp_contents: /* none */
                    {
                        $$ = 0;
                    /*% ripper: regexp_new! %*/
                    }
                | regexp_contents string_content
                    {
                        $$ = literal_concat(p, $1, $2, &@$);
                    /*% ripper: regexp_add!($:1, $:2) %*/
                    }
                ;


static rb_node_t *
new_regexp(struct parser_params *p, rb_node_t *node, int options, const YYLTYPE *loc, const YYLTYPE *opening_loc, const YYLTYPE *content_loc, const YYLTYPE *closing_loc)
{
    if (!node) {
        /* Check string is valid regex */
        rb_parser_string_t *str = STRING_NEW0();
        reg_compile(p, str, options);
        node = NEW_RB_REGULAR_EXPRESSION(str, options, loc, opening_loc, content_loc, closing_loc);
        return node;
    }
    switch (nd_type(node)) {
      case RB_STRING_NODE:
        {
            /* Check string is valid regex */
            reg_compile(p, RB_NODE_STRING(node)->unescaped, options);
            node = str2regx(p, node, options, loc, opening_loc, content_loc, closing_loc);
        }
        break;
      default:
        node = NEW_RB_INTERPOLATED_STRING0(node, loc);
        /* fall through */
      case RB_INTERPOLATED_STRING_NODE:
        rb_nd_set_type(node, RB_INTERPOLATED_REGULAR_EXPRESSION_NODE);
        rb_nd_set_loc(node, loc);
        rb_interpolated_regular_expression_node_t *const dreg = RB_NODE_INTERPOLATED_REGULAR_EXPRESSION(node);
        // dreg->as.nd_cflag = options & RE_OPTION_MASK;
        dregex_fragment_setenc(p, dreg, options);
        if (options & RE_OPTION_ONCE) {
            node = NEW_ONCE(node, loc);
        }
        break;
    }
    return node;
}

コンパイルする

NODE_REGXの場合はノードにある文字列から正規表現オブジェクトを生成してputobjectします。またNODE_DREGXの場合はcompile_dregx関数を呼び出します。 compile_dregx関数はというと、ノードの持っているNODE_STRやNODE_DSTRから文字列を生成したあとにtoregexp命令を足すことで正規表現オブジェクトを生成します。

case NODE_REGX:{
  if (!popped) {
  VALUE lit = rb_node_regx_string_val(node);
  RB_OBJ_SET_SHAREABLE(lit);
  ADD_INSN1(ret, node, putobject, lit);
  RB_OBJ_WRITTEN(iseq, Qundef, lit);
  }
  break;
}
case NODE_DREGX:
  compile_dregx(iseq, ret, node, popped);
  break;

書き換え後のノードにあうようにこれらを修正しておきます。

case RB_REGULAR_EXPRESSION_NODE: {
  if (!popped) {
      VALUE lit = rb_node_regx_string_val2(node);
      RB_OBJ_SET_SHAREABLE(lit);
      ADD_INSN1(ret, node, putobject, lit);
      RB_OBJ_WRITTEN(iseq, Qundef, lit);
  }
  break;
}
case RB_INTERPOLATED_REGULAR_EXPRESSION_NODE: {
  compile_dregx(iseq, ret, node, popped);
  break;
}


static int
compile_dregx(rb_iseq_t *iseq, LINK_ANCHOR *const ret, const NODE *const node, int popped)
{
    int cnt;
    int cflag = 0;

    CHECK(compile_dstr_fragments(iseq, ret, node, &cnt, TRUE));
    ADD_INSN2(ret, node, toregexp, INT2FIX(cflag), INT2FIX(cnt));

    if (popped) {
        ADD_INSN(ret, node, pop);
    }

    return COMPILE_OK;
}

それぞれ実行してみるとちゃんと正規表現オブジェクトが生成されています。

var = "b"

p /abc/
#=> /abc/
p /a#{var}c/
#=> /abc/

正規表現のoption

正規表現リテラルは文字列リテラルやsymbolリテラルと異なりオプションを記述することができます。

/abc/e.encoding
#=> #<Encoding:EUC-JP>

/abc/.match? "ABC"
#=> false
/abc/i.match? "ABC"
#=> true

このオプションはtREGEXP_ENDトークンのsemantic valueになっています。

regexp : tREGEXP_BEG regexp_contents tREGEXP_END

lexerの実装をみるとregx_options関数でオプションの値を計算していることがわかります。

static inline enum yytokentype
parser_string_term(struct parser_params *p, int func)
{
    xfree(p->lex.strterm);
    p->lex.strterm = 0;
    if (func & STR_FUNC_REGEXP) {
        set_yylval_num(regx_options(p));
        dispatch_scan_event(p, tREGEXP_END);
        SET_LEX_STATE(EXPR_END);
        return tREGEXP_END;
    }
...

regx_options関数と関連するいくつかの関数の実装を一応のせておきます。

static int
regx_options(struct parser_params *p)
{
    int kcode = 0;
    int kopt = 0;
    int options = 0;
    int c, opt, kc;

    newtok(p);
    while (c = nextc(p), ISALPHA(c)) {
        if (c == 'o') {
            options |= RE_OPTION_ONCE;
        }
        else if (char_to_option_kcode(c, &opt, &kc)) {
            if (kc >= 0) {
                if (kc != ENC_ASCII8BIT) kcode = c;
                kopt = opt;
            }
            else {
                options |= opt;
            }
        }
        else {
            tokadd(p, c);
        }
    }
    options |= kopt;
    pushback(p, c);
    if (toklen(p)) {
        YYLTYPE loc = RUBY_INIT_YYLLOC();
        tokfix(p);
        compile_error(p, "unknown regexp option%s - %*s",
                      toklen(p) > 1 ? "s" : "", toklen(p), tok(p));
        parser_show_error_line(p, &loc);
    }
    return options | RE_OPTION_ENCODING(kcode);
}

static int
char_to_option_kcode(int c, int *option, int *kcode)
{
    *option = 0;

    switch (c) {
      case 'n':
        *kcode = ENC_ASCII8BIT;
        return (*option = ARG_ENCODING_NONE);
      case 'e':
        *kcode = ENC_EUC_JP;
        break;
      case 's':
        *kcode = ENC_Windows_31J;
        break;
      case 'u':
        *kcode = ENC_UTF8;
        break;
      default:
        *kcode = -1;
        return (*option = char_to_option(c));
    }
    *option = ARG_ENCODING_FIXED;
    return 1;
}

static int
char_to_option(int c)
{
    int val;

    switch (c) {
      case 'i':
        val = RE_ONIG_OPTION_IGNORECASE;
        break;
      case 'x':
        val = RE_ONIG_OPTION_EXTEND;
        break;
      case 'm':
        val = RE_ONIG_OPTION_MULTILINE;
        break;
      default:
        val = 0;
        break;
    }
    return val;
}

これをみてパッとわかるなら問題ないので以下はスキップいいのですが、初見だとなかなか頭に入らないかもしれません。そこでまずは正規表現リテラルに指定可能なオプションを確認しておきましょう。

i: Ignorecase
x: eXtended
m: Multiline
n: Noencoding
e: Euc-jp
s: Windows-31J¹
u: Utf-8
o: Once

regx_options関数はこれらのオプションたちをintとして返します。ということはbitそれぞれにオプションを割り当てているのでしょう。

'i' RE_ONIG_OPTION_IGNORECASE                 1
'x' RE_ONIG_OPTION_EXTEND                    10
'm' RE_ONIG_OPTION_MULTILINE                100
    ARG_ENCODING_FIXED                    10000
    RE_OPTION_ARG_ENCODING_NONE          100000
    encoding                   xxxxxxxx00000000
'o' RE_OPTION_ONCE            10000000000000000

未使用のビットも含めて下位8ビットが正規表現に関するオプション、その上の8ビットがエンコーディングに関するビットで、その上の1ビットがoオプションを表しているようです。エンコーディングについて2点補足しておきます。

エンコーディングを表す8ビットには文字が入っている²
nオプションのときはエンコーディングに関するビットは立たず、代わりにRE_OPTION_ARG_ENCODING_NONE(ARG_ENCODING_NONE)のビットが立っている

以上を踏まえてparse.yではint optionからrb_regular_expression_flags_tに変換してノードのflagに書き込み、compileのときにflagをint optionに戻してrb_reg_compile関数に渡すようにします。

// parse
static rb_regular_expression_flags_t
regx_option_flags(int options)
{
    rb_regular_expression_flags_t flags = 0;
    int enc = RE_OPTION_ENCODING_IDX(options);

    switch (enc) {
      case 'e':
       flags |= RB_REGULAR_EXPRESSION_FLAGS_EUC_JP;
       break;
      case 's':
       flags |= RB_REGULAR_EXPRESSION_FLAGS_WINDOWS_31J;
       break;
      case 'u':
       flags |= RB_REGULAR_EXPRESSION_FLAGS_UTF_8;
       break;
    }
    if (options & ARG_ENCODING_NONE) flags |= RB_REGULAR_EXPRESSION_FLAGS_ASCII_8BIT;
    if (options & RE_ONIG_OPTION_IGNORECASE) flags |= RB_REGULAR_EXPRESSION_FLAGS_IGNORE_CASE;
    if (options & RE_ONIG_OPTION_EXTEND) flags |= RB_REGULAR_EXPRESSION_FLAGS_EXTENDED;
    if (options & RE_ONIG_OPTION_MULTILINE) flags |= RB_REGULAR_EXPRESSION_FLAGS_MULTI_LINE;

    return flags;
}

// compile
static int
node_regx_options(rb_node_flags_t flags)
{
    int options = 0;
    int kcode = 0;

    if (flags & RB_REGULAR_EXPRESSION_FLAGS_EUC_JP) {
        kcode = 'e';
        options |= ARG_ENCODING_FIXED;
    }
    else if (flags & RB_REGULAR_EXPRESSION_FLAGS_ASCII_8BIT) {
        options |= ARG_ENCODING_NONE;
    }
    else if (flags & RB_REGULAR_EXPRESSION_FLAGS_WINDOWS_31J) {
        kcode = 's';
        options |= ARG_ENCODING_FIXED;
    }
    else if (flags & RB_REGULAR_EXPRESSION_FLAGS_UTF_8) {
        kcode = 'u';
        options |= ARG_ENCODING_FIXED;
    }

    if (flags & RB_REGULAR_EXPRESSION_FLAGS_IGNORE_CASE) options |= RE_ONIG_OPTION_IGNORECASE;
    if (flags & RB_REGULAR_EXPRESSION_FLAGS_EXTENDED) options |= RE_ONIG_OPTION_EXTEND;
    if (flags & RB_REGULAR_EXPRESSION_FLAGS_MULTI_LINE) options |= RE_ONIG_OPTION_MULTILINE;

    return options | RE_OPTION_ENCODING(kcode);
}

それぞれ呼び出し元でfprintfを使って、変換前の値と復元後の値を表示してみます。

p //
p //ixm
#=> new_regexp: 0
#=> new_regexp: 7
#=> node_regx_options: 0
#=> node_regx_options: 7
#=> //
#=> //mix

p //n.encoding
p //e.encoding
p //s.encoding
p //u.encoding
#=> new_regexp: 32
#=> new_regexp: 25872
#=> ../../test.rb: warning: failed to load encoding (EUC-JP); use ASCII-8BIT instead
#=> new_regexp: 29456
#=> ../../test.rb: warning: failed to load encoding (Windows-31J); use ASCII-8BIT instead
#=> new_regexp: 29968
#=> node_regx_options: 32
#=> node_regx_options: 25872
#=> node_regx_options: 29456
#=> node_regx_options: 29968
#=> #<Encoding:US-ASCII>
#=> #<Encoding:BINARY (ASCII-8BIT)>
#=> #<Encoding:BINARY (ASCII-8BIT)>
#=> #<Encoding:UTF-8>

minirubyなので一部のencodingが使えませんが、optionの値がもとの値と一致していることが確認できました。

ONCE

正規表現リテラルのoオプションは特別な対応が必要です。

def m
  p :m
  /#{p :regex}/
end

def m2
  p :m2
  /#{p :regex2}/o
end

m
#=> :m
#=> :regex
m
#=> :m
#=> :regex

m2
#=> :m2
#=> :regex2
m2
#=> :m2

oオプションはその正規表現の式展開を一度だけ行うようにするオプションです。そのためmメソッドは呼ばれる都度p :regexが実行されるのに対して、m2メソッドの場合は二回目以降の呼び出しでp :regex2が実行されません。

ノードの書き換え前はNODE_ONCEという専用のノードの下にNODE_DREGXが置かれます。

# @ NODE_ONCE (id: 6, line: 1, location: (1,0)-(1,11))*
# +- nd_body:
#     @ NODE_DREGX (id: 0, line: 1, location: (1,0)-(1,11))
#     +- string: "a"
#     +- nd_next->nd_head:
#     |   @ NODE_EVSTR (id: 2, line: 1, location: (1,2)-(1,8))
#     |   +- nd_body:
#     |   |   @ NODE_VCALL (id: 1, line: 1, location: (1,4)-(1,7))
#     |   |   +- nd_mid: :var
#     |   +- opening_loc: (1,2)-(1,4)
#     |   +- closing_loc: (1,7)-(1,8)
#     +- nd_next->nd_next:
#         @ NODE_LIST (id: 5, line: 1, location: (1,8)-(1,9))
#         +- as.nd_alen: 1
#         +- nd_head:
#         |   @ NODE_STR (id: 4, line: 1, location: (1,8)-(1,9))
#         |   +- string: "c"
#         +- nd_next:
#             (null node)
/a#{var}c/o

書き換え後は専用のノードは存在せず、RegularExpressionFlagsで表現するようになります。

# +-- @ InterpolatedRegularExpressionNode (location: (1,0)-(1,11))*
#     +-- RegularExpressionFlags: once
/a#{var}c/o

ONCEをコンパイルする

ノードの書き換え前のバイトコードをみてみましょう。

# == disasm: #<ISeq:<main>@test.rb:1 (1,0)-(1,11)>
# 0000 once                                   block in <main>, <is:0>   (   1)[Li]
# 0003 leave

# == disasm: #<ISeq:block in <main>@test.rb:1 (1,0)-(1,11)>
# 0000 putobject                              "a"                       (   1)
# 0002 putself
# 0003 opt_send_without_block                 <calldata!mid:var, argc:0, FCALL|VCALL|ARGS_SIMPLE>
# 0005 dup
# 0006 objtostring                            <calldata!mid:to_s, argc:0, FCALL|ARGS_SIMPLE>
# 0008 anytostring
# 0009 putobject                              "c"
# 0011 toregexp                               0, 3
# 0014 leave
/a#{var}c/o

2つのISeqが生成され、正規表現オブジェクトを生成する部分はonceのオペランドになっていることがわかります。一度しか実行しないということは、一度実行した結果をどこかに保存しておき、次のタイミングではその結果を取り出して使うという実装になっているはずです。この一度実行した結果を保存する場所というのがISeqのISE(Inline Storage Entry)という場所で、once命令の第二オペランドである<is:0>はこのISEを示しています。

typedef union iseq_inline_storage_entry *ISE;

union iseq_inline_storage_entry {
    struct {
        struct rb_thread_struct *running_thread;
        VALUE value;
    } once;
    struct iseq_inline_constant_cache ic_cache;
    struct iseq_inline_iv_cache_entry iv_cache;
};

NODE_ONCEをコンパイルする箇所をみてみると、bodyを別のISeqとしてコンパイルしてonce命令を生成してることがわかります。

case NODE_ONCE:{
  int ic_index = body->ise_size++;
  const rb_iseq_t *block_iseq;
  block_iseq = NEW_CHILD_ISEQ(RNODE_ONCE(node)->nd_body, make_name_for_block(iseq), ISEQ_TYPE_PLAIN, line);

  ADD_INSN2(ret, node, once, block_iseq, INT2FIX(ic_index));
  RB_OBJ_WRITTEN(iseq, Qundef, (VALUE)block_iseq);

  if (popped) {
      ADD_INSN(ret, node, pop);
  }
  break;
}

またVM側のonce命令に相当するvm_once_dispatch関数をみてみると、running_threadの値をチェックしてiseqを評価もしくはvalueを返すようになっていることがわかります。

static VALUE
vm_once_dispatch(rb_execution_context_t *ec, ISEQ iseq, ISE is)
{
    rb_thread_t *th = rb_ec_thread_ptr(ec);
    rb_thread_t *const RUNNING_THREAD_ONCE_DONE = (rb_thread_t *)(0x1);

  again:
    if (is->once.running_thread == RUNNING_THREAD_ONCE_DONE) {
        return is->once.value;
    }
    else if (is->once.running_thread == NULL) {
        VALUE val;
        is->once.running_thread = th;
        val = rb_ensure(vm_once_exec, (VALUE)iseq, vm_once_clear, (VALUE)is);
        // TODO: confirm that it is shareable

        if (RB_FL_ABLE(val)) {
            RB_OBJ_SET_SHAREABLE(val);
        }

        RB_OBJ_WRITE(ec->cfp->iseq, &is->once.value, val);

        /* is->once.running_thread is cleared by vm_once_clear() */
        is->once.running_thread = RUNNING_THREAD_ONCE_DONE; /* success */
        return val;
    }
    else if (is->once.running_thread == th) {
        /* recursive once */
        return vm_once_exec((VALUE)iseq);
    }
    else {
        /* waiting for finish */
        RUBY_VM_CHECK_INTS(ec);
        rb_thread_schedule();
        goto again;
    }
}

once命令の生成時はINT2FIX(ic_index)であった第二オペランドがvm_once_dispatch関数の時点ではISE isになっているのが気になるかもしれませんが、ISeqを生成する途中でintからポインタへの変換をしているので、実際の第二オペランドはISEになっています。

                      case TS_ISE: /* inline storage entry: `once` insn */
                      case TS_ICVARC: /* inline cvar cache */
                        {
                            unsigned int ic_index = FIX2UINT(operands[j]);
                            IC ic = &ISEQ_IS_ENTRY_START(body, type)[ic_index].ic_cache;
                            if (UNLIKELY(ic_index >= ISEQ_IS_SIZE(body))) {
                                BADINSN_DUMP(anchor, &iobj->link, 0);
                                COMPILE_ERROR(iseq, iobj->insn_info.line_no,
                                              "iseq_set_sequence: ic_index overflow: index: %d, size: %d",
                                              ic_index, ISEQ_IS_SIZE(body));
                            }
                            generated_iseq[code_index + 1 + j] = (VALUE)ic;

                            break;
                        }

NODE_ONCEはなくなったのでInterpolatedRegularExpressionNodeのコンパイル時にフラグをみて処理を分岐するようにしましょう。

      case RB_INTERPOLATED_REGULAR_EXPRESSION_NODE: {
        if (rb_node_get_fl(node) & RB_REGULAR_EXPRESSION_FLAGS_ONCE) {
            int ic_index = body->ise_size++;
            const rb_iseq_t *block_iseq;
            block_iseq = NEW_CHILD_ISEQ(node, make_name_for_block(iseq), ISEQ_TYPE_PLAIN, line);

            ADD_INSN2(ret, node, once, block_iseq, INT2FIX(ic_index));
            RB_OBJ_WRITTEN(iseq, Qundef, (VALUE)block_iseq);

            if (popped) {
                ADD_INSN(ret, node, pop);
            }
            break;
        }
        else {
            compile_dregx(iseq, ret, node, popped);
            break;
        }
      }

この状態でバイトコードを生成してみるとスタックを食い潰してエラーで落ちます。

/a#{var}c/o
#=> stack level too deep (SystemStackError)

これまではNEW_CHILD_ISEQ(RNODE_ONCE(node)->nd_body, ...)と1つノードを潰していたところが、今回の変更でNEW_CHILD_ISEQ(node, ...)に変わったのでいつまでも同じInterpolatedRegularExpressionNodeをコンパイルするようになり、ひたすら関数呼び出しを続けてスタックを食い潰すようになってしまいました。

ISEQ_TYPE_PLAINはこのためにしか使われていないことを踏まえてrb_iseq_compile_node関数で呼び出す関数を変更します。

@@ -1141,7 +1142,8 @@ rb_iseq_compile_node(rb_iseq_t *iseq, const NODE *node)
             CHECK(COMPILE_POPPED(ret, "ensure", node));
             break;
           case ISEQ_TYPE_PLAIN:
-            CHECK(COMPILE(ret, "ensure", node));
+            EXPECT_NODE("ISEQ_TYPE_PLAIN", node, RB_INTERPOLATED_REGULAR_EXPRESSION_NODE, COMPILE_NG);
+            CHECK(compile_dregx(iseq, ret, node, false));
             break;

修正後はちゃんとバイトコードが生成されるようになりました。

# == disasm: #<ISeq:<main>@../../test.rb:1 (1,0)-(1,11)>
# 0000 once                                   block in <main>, <is:0>   (   1)[Li]
# 0003 leave

# == disasm: #<ISeq:block in <main>@../../test.rb:1 (1,0)-(1,11)>
# 0000 putobject                              "a"                       (   1)
# 0002 putself                                [Li]
# 0003 opt_send_without_block                 <calldata!mid:var, argc:0, FCALL|VCALL|ARGS_SIMPLE>
# 0005 dup
# 0006 objtostring                            <calldata!mid:to_s, argc:0, FCALL|ARGS_SIMPLE>
# 0008 anytostring
# 0009 putobject                              "c"
# 0011 toregexp                               0, 3
# 0014 leave
/a#{var}c/o

まとめ

今日の成果です。

正規表現に対応した

Shift_JISのsかと思ったんですが、文字コード周りで迂闊なことを言いたくないので黙っています。↩
あれ、Rubyってcharは8bitって前提をおいていたっけとか、'n'って常に8bitで表現できたっけとか一瞬気になるところですが、ここでは見なかったことにします。C言語力が低い!↩