26日目: 文字列に関するいろいろ

前回は文字リテラル、文字列リテラル、式展開などの対応をしました。しかし文字列に関連するトピックはこれだけではありません。今回は%記法、文字列の結合、ヒアドキュメントなどをみていきます。

%記法

Rubyには%記法というシンタックスがあり、その一部は文字列を生成します。

var = "a"

%!#{var}!
#=> "a"
%Q[#{var}]
#=> "a"
%q[#{var}]
#=> "\#{var}"

これらはparse.yのどのルールに該当するのでしょうか。たとえば以下のようなコードを用意して-yオプションを指定してrubyを実行してみます。

var = "a"

%!#{var}!

位置情報を見ながら生成されるトークンや適用される生成規則を確認すると、%!は"string literal"、!は"terminator"というトークンになっていて、全体はstring1になっていることがわかります。

Next token is token "string literal" (3.0-3.2: )
...
Reducing stack by rule 639 (line 5936):
   $1 = token "string literal" (3.0-3.2: )
   $2 = nterm string_contents (3.2-3.8: NODE_EVSTR)
   $3 = token "terminator" (3.8-3.9: )
-> $$ = nterm string1 (3.0-3.9: NODE_EVSTR)

念のためトークンの定義と生成規則をみておくと、これは前回扱った"str"や"#{var}"と同じであることがわかります。

%token tSTRING_BEG   "string literal"
%token tSTRING_END  "terminator"

string1     : tSTRING_BEG string_contents tSTRING_END

ということは特別なにかしなくても動くのでは...?

var = "a"

# +-- @ InterpolatedStringNode (location: (0,3)-(9,8))*
# |   +-- InterpolatedStringNodeFlags: nil
# |   +-- opening_loc: nil
# |   +-- parts: (length: 1)
# |   |   +-- @ EmbeddedStatementsNode (location: (0,3)-(9,8))
# |   |       +-- opening_loc: (3,2)-(3,4) = ""
# |   |       +-- statements:
# |   |       |   @ StatementsNode (location: (3,4)-(3,7))
# |   |       |   +-- body: (length: 1)
# |   |       |       +-- @ LocalVariableReadNode (location: (3,4)-(3,7))*
# |   |       |           +-- name: :var
# |   |       |           +-- depth: 0
# |   |       +-- closing_loc: (3,7)-(3,8) = ""
# |   +-- closing_loc: nil
#
# 0004 getlocal_WC_0                          var@0                     (   3)[Li]
# 0006 dup
# 0007 objtostring                            <calldata!mid:to_s, argc:0, FCALL|ARGS_SIMPLE>
# 0009 anytostring
# 0010 pop
%!#{var}!

# +-- @ InterpolatedStringNode (location: (0,4)-(10,9))*
# |   +-- InterpolatedStringNodeFlags: nil
# |   +-- opening_loc: nil
# |   +-- parts: (length: 1)
# |   |   +-- @ EmbeddedStatementsNode (location: (0,4)-(10,9))
# |   |       +-- opening_loc: (4,3)-(4,5) = ""
# |   |       +-- statements:
# |   |       |   @ StatementsNode (location: (4,5)-(4,8))
# |   |       |   +-- body: (length: 1)
# |   |       |       +-- @ LocalVariableReadNode (location: (4,5)-(4,8))*
# |   |       |           +-- name: :var
# |   |       |           +-- depth: 0
# |   |       +-- closing_loc: (4,8)-(4,9) = ""
# |   +-- closing_loc: nil
#
# 0011 getlocal_WC_0                          var@0                     (   4)[Li]
# 0013 dup
# 0014 objtostring                            <calldata!mid:to_s, argc:0, FCALL|ARGS_SIMPLE>
# 0016 anytostring
# 0017 pop
%Q[#{var}]

# +-- @ StringNode (location: (0,5)-(10,9))*
#     +-- StringFlags: nil
#     +-- opening_loc: nil
#     +-- content_loc: (0,-1)-(0,-1) = ""
#     +-- closing_loc: nil
#     +-- unescaped: "\#{var}"
#
# 0018 putchilledstring                       "\#{var}"                 (   5)[Li]
# 0020 leave
%q[#{var}]

よさそうです。

文字列の結合

Rubyでは文字列の並びは1つの文字列に結合されます。

var = "str"

"aaa" "bbb"
#=> "aaabbb"
"aaa" "bbb" "#{var}" "ccc" "ddd"
#=> "aaabbbstrcccddd"

これについてはノードの書き換え前後で細かい部分が変わります。書き換え前は"aaa"と"bbb"を結合して、1つのNODE_STRにしていましたが、書き換え後は2つのStringNodeがInterpolatedStringNodeにぶら下がることになります。

"aaa" "bbb"

# Before
#
# @ NODE_STR (id: 0, line: 1, location: (1,0)-(1,5))*
# +- string: "aaabbb"

# After
#
# @ InterpolatedStringNode (location: (1,0)-(1,11))
# +-- InterpolatedStringNodeFlags: nil
# +-- opening_loc: nil
# +-- parts: (length: 2)
# |   +-- @ StringNode (location: (1,0)-(1,5))
# |   |   +-- StringFlags: frozen
# |   |   +-- opening_loc: (1,0)-(1,1) = "\""
# |   |   +-- content_loc: (1,1)-(1,4) = "aaa"
# |   |   +-- closing_loc: (1,4)-(1,5) = "\""
# |   |   +-- unescaped: "aaa"
# |   +-- @ StringNode (location: (1,6)-(1,11))
# |       +-- StringFlags: frozen
# |       +-- opening_loc: (1,6)-(1,7) = "\""
# |       +-- content_loc: (1,7)-(1,10) = "bbb"
# |       +-- closing_loc: (1,10)-(1,11) = "\""
# |       +-- unescaped: "bbb"
# +-- closing_loc: nil

実はこの部分も前回の実装でカバーされています。というのも"aaa#{var}"のaaaと#{var}をまとめる部分と、"aaa" "bbb"の"aaa"と"bbb"をまとめる部分はどちらもliteral_concat関数を呼んでいるので、必要に応じてInterpolatedStringNodeを作成、もしくはInterpolatedStringNodeへの追加を行なってくれます。

string          : tCHAR
                | string1
                // `"aaa" "bbb"`の`"aaa"`と`"bbb"`をまとめる部分
                | string string1
                    {
                        $$ = literal_concat(p, $1, $2, &@$);
                    /*% ripper: string_concat!($:1, $:2) %*/
                    }
                ;

// `"aaa#{var}"`の`aaa`と`#{var}`をまとめる部分
string_contents : /* none */
                    {
                        $$ = 0;
                    /*% ripper: string_content! %*/
                    }
                | string_contents string_content
                    {
                        $$ = literal_concat(p, $1, $2, &@$);
                    /*% ripper: string_add!($:1, $:2) %*/
                    }
                ;

/*
 * Combine two string-like nodes into a single node.
 *
 * - If either operand is NULL, the other one is returned untouched.
 * - If head is already an InterpolatedStringNode, tail is appended to its
 *   parts list and head is returned.
 * - Otherwise a new InterpolatedStringNode is built from head and tail,
 *   with loc handed to the constructor.
 *
 * Called both for adjacent literals ("aaa" "bbb") and for the pieces inside
 * one literal ("aaa#{var}").
 */
static rb_node_t *
literal_concat(struct parser_params *p, rb_node_t *head, rb_node_t *tail, const YYLTYPE *loc)
{
    if (!head) return tail;
    if (!tail) return head;

    switch (RB_NODE_TYPE(head)) {
      case RB_INTERPOLATED_STRING_NODE:
        /* NOTE(review): loc is not used on this path, so head keeps the
         * location it was created with -- confirm whether it should be
         * widened to cover tail. */
        rb_node_list_append(&RB_NODE_INTERPOLATED_STRING(head)->parts, tail);
        return head;
      default:
        return NEW_RB_INTERPOLATED_STRING(head, tail, loc);
    }
}

実際に試してみましょう。

var = "str"

# +-- @ InterpolatedStringNode (location: (3,0)-(3,11))*
# |   +-- InterpolatedStringNodeFlags: nil
# |   +-- opening_loc: nil
# |   +-- parts: (length: 2)
# |   |   +-- @ StringNode (location: (0,3)-(5,4))
# |   |   |   +-- StringFlags: nil
# |   |   |   +-- opening_loc: nil
# |   |   |   +-- content_loc: (0,-1)-(0,-1) = ""
# |   |   |   +-- closing_loc: nil
# |   |   |   +-- unescaped: "aaa"
# |   |   +-- @ StringNode (location: (6,3)-(11,10))
# |   |       +-- StringFlags: nil
# |   |       +-- opening_loc: nil
# |   |       +-- content_loc: (0,-1)-(0,-1) = ""
# |   |       +-- closing_loc: nil
# |   |       +-- unescaped: "bbb"
# |   +-- closing_loc: nil
#
# 0004 putobject                              "aaabbb"                  (   3)[Li]
# 0006 pop
"aaa" "bbb"

# +-- @ InterpolatedStringNode (location: (4,0)-(4,11))*
#     +-- InterpolatedStringNodeFlags: nil
#     +-- opening_loc: nil
#     +-- parts: (length: 5)
#     |   +-- @ StringNode (location: (0,4)-(5,4))
#     |   |   +-- StringFlags: nil
#     |   |   +-- opening_loc: nil
#     |   |   +-- content_loc: (0,-1)-(0,-1) = ""
#     |   |   +-- closing_loc: nil
#     |   |   +-- unescaped: "aaa"
#     |   +-- @ StringNode (location: (6,4)-(11,10))
#     |   |   +-- StringFlags: nil
#     |   |   +-- opening_loc: nil
#     |   |   +-- content_loc: (0,-1)-(0,-1) = ""
#     |   |   +-- closing_loc: nil
#     |   |   +-- unescaped: "bbb"
#     |   +-- @ EmbeddedStatementsNode (location: (12,4)-(20,19))
#     |   |   +-- opening_loc: (4,13)-(4,15) = ""
#     |   |   +-- statements:
#     |   |   |   @ StatementsNode (location: (4,15)-(4,18))
#     |   |   |   +-- body: (length: 1)
#     |   |   |       +-- @ LocalVariableReadNode (location: (4,15)-(4,18))*
#     |   |   |           +-- name: :var
#     |   |   |           +-- depth: 0
#     |   |   +-- closing_loc: (4,18)-(4,19) = ""
#     |   +-- @ StringNode (location: (21,4)-(26,25))
#     |   |   +-- StringFlags: nil
#     |   |   +-- opening_loc: nil
#     |   |   +-- content_loc: (0,-1)-(0,-1) = ""
#     |   |   +-- closing_loc: nil
#     |   |   +-- unescaped: "ccc"
#     |   +-- @ StringNode (location: (27,4)-(32,31))
#     |       +-- StringFlags: nil
#     |       +-- opening_loc: nil
#     |       +-- content_loc: (0,-1)-(0,-1) = ""
#     |       +-- closing_loc: nil
#     |       +-- unescaped: "ddd"
#     +-- closing_loc: nil
#
# 0007 putobject                              "aaabbb"                  (   4)[Li]
# 0009 getlocal_WC_0                          var@0
# 0011 dup
# 0012 objtostring                            <calldata!mid:to_s, argc:0, FCALL|ARGS_SIMPLE>
# 0014 anytostring
# 0015 putobject                              "cccddd"
# 0017 concatstrings                          3
"aaa" "bbb" "#{var}" "ccc" "ddd"

良さそうですね。

ヒアドキュメント

さて、ヒアドキュメントと向き合うときがきました。とりあえずヒアドキュメントがどのようなトークンと生成規則からなるのか確認しましょう。

var = "str"

<<STR1
a
#{var}
b
STR1

ログをながめてみると実は"str"などと同じトークン、同じ生成規則であることがわかります。

Next token is token "string literal" (3.0-3.6: )
...
Reducing stack by rule 639 (line 5936):
   $1 = token "string literal" (3.0-3.6: )
   $2 = nterm string_contents (3.6-3.6: NODE_DSTR)
   $3 = token "terminator" (3.0-3.6: )
-> $$ = nterm string1 (3.0-3.6: NODE_DSTR)

これは勝ち確定ですね。実際にノードとバイトコードをみてみましょう。

var = "str"

# +-- @ InterpolatedStringNode (location: (0,3)-(6,6))*
# |   +-- InterpolatedStringNodeFlags: nil
# |   +-- opening_loc: nil
# |   +-- parts: (length: 3)
# |   |   +-- @ StringNode (location: (5,0)-(5,0))
# |   |   |   +-- StringFlags: nil
# |   |   |   +-- opening_loc: nil
# |   |   |   +-- content_loc: (0,-1)-(0,-1) = ""
# |   |   |   +-- closing_loc: nil
# |   |   |   +-- unescaped: "a\n"
# |   |   +-- @ EmbeddedStatementsNode (location: (5,0)-(5,6))
# |   |   |   +-- opening_loc: (5,0)-(5,2) = ""
# |   |   |   +-- statements:
# |   |   |   |   @ StatementsNode (location: (5,2)-(5,5))
# |   |   |   |   +-- body: (length: 1)
# |   |   |   |       +-- @ LocalVariableReadNode (location: (5,2)-(5,5))*
# |   |   |   |           +-- name: :var
# |   |   |   |           +-- depth: 0
# |   |   |   +-- closing_loc: (5,5)-(5,6) = ""
# |   |   +-- @ StringNode (location: (3,6)-(3,6))
# |   |       +-- StringFlags: nil
# |   |       +-- opening_loc: nil
# |   |       +-- content_loc: (0,-1)-(0,-1) = ""
# |   |       +-- closing_loc: nil
# |   |       +-- unescaped: "\nb\n"
# |   +-- closing_loc: nil
#
# 0004 putobject                              "a\n"                     (   3)[Li]
# 0006 getlocal_WC_0                          var@0                     (   5)[Li]
# 0008 dup
# 0009 objtostring                            <calldata!mid:to_s, argc:0, FCALL|ARGS_SIMPLE>
# 0011 anytostring
# 0012 putobject                              "\nb\n"                   (   3)
# 0014 adjuststack                            3
<<STR1
a
#{var}
b
STR1

# +-- @ InterpolatedStringNode (location: (0,9)-(7,6))*
# |   +-- InterpolatedStringNodeFlags: nil
# |   +-- opening_loc: nil
# |   +-- parts: (length: 3)
# |   |   +-- @ StringNode (location: (11,0)-(11,0))
# |   |   |   +-- StringFlags: nil
# |   |   |   +-- opening_loc: nil
# |   |   |   +-- content_loc: (0,-1)-(0,-1) = ""
# |   |   |   +-- closing_loc: nil
# |   |   |   +-- unescaped: "a\n"
# |   |   +-- @ EmbeddedStatementsNode (location: (11,0)-(11,6))
# |   |   |   +-- opening_loc: (11,0)-(11,2) = ""
# |   |   |   +-- statements:
# |   |   |   |   @ StatementsNode (location: (11,2)-(11,5))
# |   |   |   |   +-- body: (length: 1)
# |   |   |   |       +-- @ LocalVariableReadNode (location: (11,2)-(11,5))*
# |   |   |   |           +-- name: :var
# |   |   |   |           +-- depth: 0
# |   |   |   +-- closing_loc: (11,5)-(11,6) = ""
# |   |   +-- @ StringNode (location: (9,7)-(9,7))
# |   |       +-- StringFlags: nil
# |   |       +-- opening_loc: nil
# |   |       +-- content_loc: (0,-1)-(0,-1) = ""
# |   |       +-- closing_loc: nil
# |   |       +-- unescaped: "\nb\n"
# |   +-- closing_loc: nil
#
# 0016 putobject                              "a\n"                     (   9)[Li]
# 0018 getlocal_WC_0                          var@0                     (  11)[Li]
# 0020 dup
# 0021 objtostring                            <calldata!mid:to_s, argc:0, FCALL|ARGS_SIMPLE>
# 0023 anytostring
# 0024 putobject                              "\nb\n"                   (   9)
# 0026 adjuststack                            3
<<-STR1
a
#{var}
b
STR1

# +-- @ InterpolatedStringNode (location: (0,15)-(7,2))*
#     +-- InterpolatedStringNodeFlags: nil
#     +-- opening_loc: nil
#     +-- parts: (length: 5)
#     |   +-- @ StringNode (location: (16,0)-(16,4))
#     |   |   +-- StringFlags: nil
#     |   |   +-- opening_loc: nil
#     |   |   +-- content_loc: (0,-1)-(0,-1) = ""
#     |   |   +-- closing_loc: nil
#     |   |   +-- unescaped: "  a\n"
#     |   +-- @ StringNode (location: (17,0)-(17,2))
#     |   |   +-- StringFlags: nil
#     |   |   +-- opening_loc: nil
#     |   |   +-- content_loc: (0,-1)-(0,-1) = ""
#     |   |   +-- closing_loc: nil
#     |   |   +-- unescaped: "  "
#     |   +-- @ EmbeddedStatementsNode (location: (17,2)-(17,8))
#     |   |   +-- opening_loc: (17,2)-(17,4) = ""
#     |   |   +-- statements:
#     |   |   |   @ StatementsNode (location: (17,4)-(17,7))
#     |   |   |   +-- body: (length: 1)
#     |   |   |       +-- @ LocalVariableReadNode (location: (17,4)-(17,7))*
#     |   |   |           +-- name: :var
#     |   |   |           +-- depth: 0
#     |   |   +-- closing_loc: (17,7)-(17,8) = ""
#     |   +-- @ StringNode (location: (17,8)-(17,9))
#     |   |   +-- StringFlags: nil
#     |   |   +-- opening_loc: nil
#     |   |   +-- content_loc: (0,-1)-(0,-1) = ""
#     |   |   +-- closing_loc: nil
#     |   |   +-- unescaped: "\n"
#     |   +-- @ StringNode (location: (18,0)-(18,4))
#     |       +-- StringFlags: nil
#     |       +-- opening_loc: nil
#     |       +-- content_loc: (0,-1)-(0,-1) = ""
#     |       +-- closing_loc: nil
#     |       +-- unescaped: "  b\n"
#     +-- closing_loc: nil
#
# 0028 putobject                              "  a\n  "                 (  15)[Li]
# 0030 getlocal_WC_0                          var@0                     (  17)[Li]
# 0032 dup
# 0033 objtostring                            <calldata!mid:to_s, argc:0, FCALL|ARGS_SIMPLE>
# 0035 anytostring
# 0036 putobject                              "\n  b\n"                 (  15)
# 0038 concatstrings                          3
<<~STR1
  a
  #{var}
  b
STR1

えーっと、<<~STR1のケースでインデントがそのまま残っているように見えます。

var = "str"

puts <<~STR1
  a
  #{var}
  b
STR1
#=>
#  a
#  str
#  b

<<~のヒアドキュメントの文字列から先頭のインデントを消す処理はheredoc_dedent関数で行っていますが、この関数の実装を全く修正していなかったのでうまく機能していなかったのでしょう。

string1        : tSTRING_BEG string_contents tSTRING_END
                    {
                        $$ = heredoc_dedent(p, $2);
                        if ($$) nd_set_loc($$, &@$);
                    /*% ripper: $:2 %*/
                        if (p->heredoc_indent > 0) {
                        /*% ripper: heredoc_dedent!($:$, INT2NUM(%{p->heredoc_indent})) %*/
                            p->heredoc_indent = 0;
                        }
                    /*% ripper: string_literal!($:$) %*/
                    }
                ;

ここでheredoc_dedent関数をみるまえに、ヒアドキュメントがどのようなトークンに分割されているか確認しておきましょう(#{var}の部分はstring_contentという非終端記号にまとめています)。

ポイントは2つあって

各行が1つのtSTRING_CONTENTになる
式展開がある行は式展開の前と後ろがtSTRING_CONTENTになる

以上を踏まえて既存のheredoc_dedent関数の実装をみてみましょう¹。

// Strip the common indentation from the body of a `<<~` heredoc (pre-rewrite,
// NODE-based implementation).
//
// root is the node produced for string_contents. Nothing is done, and root is
// returned as is, when p->heredoc_indent is not positive or root is NULL.
// Otherwise the string nodes are walked in order: those flagged as starting a
// line are passed to dedent_string, and runs of consecutive string literals
// are merged into the first one via literal_concat0, unlinking the merged
// list cell.
//
// Returns root, or 0 when literal_concat0 reports failure.
static NODE *
heredoc_dedent(struct parser_params *p, NODE *root)
{
    NODE *node, *str_node, *prev_node;
    int indent = p->heredoc_indent;
    // Literal that following string pieces get merged into; 0 means "no merge
    // target yet" (start of walk, or a non-string part was just seen).
    rb_parser_string_t *prev_lit = 0;

    if (indent <= 0) return root;
    if (!root) return root;

    prev_node = node = str_node = root;
    if (nd_type_p(root, NODE_LIST)) str_node = RNODE_LIST(root)->nd_head;

    while (str_node) {
        // str_node may be NODE_STR or NODE_DSTR here; both are read through
        // RNODE_STR to reach the string member.
        rb_parser_string_t *lit = RNODE_STR(str_node)->string;
        // Remove the indentation only when this string node is the first
        // node on its line.
        if (nd_fl_newline(str_node)) {
            dedent_string(p, lit, indent);
        }
        if (!prev_lit) {
            prev_lit = lit;
        }
        // When string nodes follow each other (across two lines), fold them
        // into a single string node.
        else if (!literal_concat0(p, prev_lit, lit)) {
            return 0;
        }
        else {
            NODE *end = RNODE_LIST(node)->as.nd_end;
            // The two lines' strings are merged now, so unlink the node
            // currently being looked at from the original list.
            node = RNODE_LIST(prev_node)->nd_next = RNODE_LIST(node)->nd_next;
            if (!node) {
                // Nothing is left after prev_node; if it was a NODE_DSTR it
                // is retyped to a plain NODE_STR.
                if (nd_type_p(prev_node, NODE_DSTR))
                    nd_set_type(prev_node, NODE_STR);
                break;
            }
            // Carry the saved nd_end over to the cell that takes the removed
            // cell's place.
            RNODE_LIST(node)->as.nd_end = end;
            goto next_str;
        }

        str_node = 0;
        // Look for the next NODE_STR or NODE_DSTR.
        while ((nd_type_p(node, NODE_LIST) || nd_type_p(node, NODE_DSTR)) && (node = RNODE_LIST(prev_node = node)->nd_next) != 0) {
          next_str:
            if (!nd_type_p(node, NODE_LIST)) break;
            if ((str_node = RNODE_LIST(node)->nd_head) != 0) {
                enum node_type type = nd_type(str_node);
                if (type == NODE_STR || type == NODE_DSTR) break;
                // A non-string part sits here: strings on either side of it
                // must not be merged, so forget the merge target.
                prev_lit = 0;
                str_node = 0;
            }
        }
    }
    return root;
}

2行にわたってNODE_STRが続くときは1つのNODE_STRに結合されるため、以下の4行のヒアドキュメントは3つのノードで表現されます。

#     @ NODE_DSTR (id: 0, line: 1, location: (1,0)-(1,7))*
#     +- string: "a\nb\n"
#     +- nd_next->nd_head:
#     |   @ NODE_EVSTR (id: 6, line: 4, location: (4,6)-(4,12))
#     |   +- nd_body:
#     |   |   @ NODE_VCALL (id: 5, line: 4, location: (4,8)-(4,11))
#     |   |   +- nd_mid: :var
#     |   +- opening_loc: (4,6)-(4,8)
#     |   +- closing_loc: (4,11)-(4,12)
#     +- nd_next->nd_next:
#         @ NODE_LIST (id: 9, line: 4, location: (4,12)-(4,13))
#         +- as.nd_alen: 1
#         +- nd_head:
#         |   @ NODE_STR (id: 8, line: 4, location: (4,12)-(4,13))
#         |   +- string: "\nc\n"
#         +- nd_next:
#             (null node)
<<~STR1
      a
      b
      #{var}
      c
STR1

ノードの書き換え後は文字列ノードの結合をしなくなるので5つのノードで表現されます。

# @ InterpolatedStringNode (location: (1,0)-(1,7))
# +-- InterpolatedStringNodeFlags: nil
# +-- parts: (length: 5)
# |   +-- @ StringNode (location: (2,0)-(3,0))
# |   |   +-- unescaped: "a\n"
# |   +-- @ StringNode (location: (3,0)-(4,0))
# |   |   +-- unescaped: "b\n"
# |   +-- @ EmbeddedStatementsNode (location: (4,6)-(4,12))
# |   +-- @ StringNode (location: (4,12)-(5,0))
# |   |   +-- unescaped: "\n"
# |   +-- @ StringNode (location: (5,0)-(6,0))
# |       +-- unescaped: "c\n"

以上のことを踏まえてheredoc_dedent関数を書き換えます。

/*
 * Strip the heredoc indentation (p->heredoc_indent) from the string parts
 * of a `<<~` heredoc body.
 *
 * root is returned unchanged when the indent is not positive or root is
 * NULL. A lone StringNode is dedented directly. For an
 * InterpolatedStringNode, every StringNode part carrying the newline flag
 * is dedented in place. Any other node type is reported through rb_bug.
 */
static rb_node_t *
heredoc_dedent(struct parser_params *p, rb_node_t *root)
{
    int width = p->heredoc_indent;

    if (width <= 0 || !root) return root;

    if (RB_NODE_TYPE_P(root, RB_STRING_NODE)) {
        dedent_string(p, RB_NODE_STRING(root)->unescaped, width);
        return root;
    }
    if (!RB_NODE_TYPE_P(root, RB_INTERPOLATED_STRING_NODE)) {
        rb_bug("unexpected node: %s", ruby_node_name(nd_type(root)));
        UNREACHABLE_RETURN(0);
    }

    const rb_node_list2_t *parts = &RB_NODE_INTERPOLATED_STRING(root)->parts;
    size_t idx = 0;

    while (idx < RB_NODE_LIST_LEN(parts)) {
        rb_node_t *part = parts->nodes[idx++];

        /* Only string parts flagged as starting a line carry indentation. */
        if (RB_NODE_TYPE_P(part, RB_STRING_NODE) && rb_node_fl_newline(part)) {
            dedent_string(p, RB_NODE_STRING(part)->unescaped, width);
        }
    }

    return root;
}

実行してみると今度はちゃんと行頭のインデントが削除されるようになりました。

var = "str"
puts <<~STR1
      a
      b
      #{var}
      c
STR1
#=>
# a
# b
# str
# c

空文字列ノードを削除する

生成されるノードを確認してみるとノードが1つ多いことがわかります。 " "のStringNodeから行頭のインデントを消した結果、""のStringNodeが含まれるようになっています。もともとの実装では1つ前のNODE_STRにマージされて消えていたのでしょう。

# @ InterpolatedStringNode (location: (0,1)-(7,8))*
# +-- parts: (length: 6)
# |   +-- @ StringNode (location: (2,0)-(2,8))*
# |   |   +-- unescaped: "a\n"
# |   +-- @ StringNode (location: (3,0)-(3,8))*
# |   |   +-- unescaped: "b\n"
# |   +-- @ StringNode (location: (4,0)-(4,6))*
# |   |   +-- unescaped: "" <- 空文字列
# |   +-- @ EmbeddedStatementsNode (location: (4,6)-(4,12))
# |   +-- @ StringNode (location: (4,12)-(4,13))
# |   |   +-- unescaped: "\n"
# |   +-- @ StringNode (location: (5,0)-(5,8))*
# |       +-- unescaped: "c\n"

<<~STR1
      a
      b
      #{var}
      c
STR1

heredoc_dedent関数の中でdedent_stringした結果をチェックして、長さが0のときは取り除くようにしましょう。

/*
 * Strip the heredoc indentation (p->heredoc_indent) from the string parts
 * of a `<<~` heredoc body and drop parts that become empty.
 *
 * root is returned unchanged when the indent is not positive or root is
 * NULL. A lone StringNode is dedented directly. For an
 * InterpolatedStringNode, every StringNode part carrying the newline flag
 * is dedented; if its length is 0 afterwards it is left out of the
 * rebuilt parts list. Any other node type is reported through rb_bug.
 */
static rb_node_t *
heredoc_dedent(struct parser_params *p, rb_node_t *root)
{
    int width = p->heredoc_indent;

    if (width <= 0 || !root) return root;

    if (RB_NODE_TYPE_P(root, RB_STRING_NODE)) {
        dedent_string(p, RB_NODE_STRING(root)->unescaped, width);
        return root;
    }
    if (!RB_NODE_TYPE_P(root, RB_INTERPOLATED_STRING_NODE)) {
        rb_bug("unexpected node: %s", ruby_node_name(nd_type(root)));
        UNREACHABLE_RETURN(0);
    }

    /* Surviving parts are gathered in the element list of a temporary
     * array node instead of a hand-managed node list. */
    rb_array_node_t *kept = (rb_array_node_t *)NEW_RB_ZARRAY(&NULL_LOC);
    const rb_node_list2_t *parts = &RB_NODE_INTERPOLATED_STRING(root)->parts;

    for (size_t idx = 0; idx < RB_NODE_LIST_LEN(parts); idx++) {
        rb_node_t *part = parts->nodes[idx];
        int keep = 1;

        if (RB_NODE_TYPE_P(part, RB_STRING_NODE) && rb_node_fl_newline(part)) {
            rb_parser_string_t *text = RB_NODE_STRING(part)->unescaped;

            dedent_string(p, text, width);
            /* A part reduced to "" by dedenting is not kept. */
            keep = PARSER_STRING_LEN(text) != 0;
        }
        if (keep) {
            rb_node_list_append(&kept->elements, part);
        }
    }

    rb_node_list_replace(&RB_NODE_INTERPOLATED_STRING(root)->parts, &kept->elements);
    return root;
}

node listを直接扱うとなにかあったときのメモリリークが怖いので一時的にArrayNodeをつくって、そのnode listを利用するようにしています。

生成されるノードから空文字列のStringNodeが削除されるようになりました。

#             @ InterpolatedStringNode (location: (0,1)-(7,8))*
#             +-- parts: (length: 5)
#             |   +-- @ StringNode (location: (2,0)-(2,8))*
#             |   |   +-- unescaped: "a\n"
#             |   +-- @ StringNode (location: (3,0)-(3,8))*
#             |   |   +-- unescaped: "b\n"
#             |   +-- @ EmbeddedStatementsNode (location: (4,6)-(4,12))
#             |   +-- @ StringNode (location: (4,12)-(4,13))
#             |   |   +-- unescaped: "\n"
#             |   +-- @ StringNode (location: (5,0)-(5,8))*
#             |       +-- unescaped: "c\n"
<<~STR1
      a
      b
      #{var}
      c
STR1

まとめ

今日の成果です。

%記法が対応済みであることを確認した
文字列の結合が対応済みであることを確認した
ヒアドキュメントに対応した。特に<<~からはじまるヒアドキュメントの行頭の空白が消えるように修正した。

次回は引き続き文字列関連で`cmd`の対応をしていこうと思います。

str_node変数がNODE_STRでもNODE_DSTRでも動くのは、どちらの構造体も先頭のメンバーの型が同じで、NODE nodeのあとにstruct rb_parser_string *stringが来るという構造をしているからです。↩