Rubyの配列、ハッシュ、文字列のメモリ使用量を見る。そして、固定長テキスト最強説..

※一行のサイズが40byte以下だとあまり良い検証にならないことに気づいたので、再検証予定。

最近よくRubyで大規模テキストの処理をしています。
大規模と言っても、数GBレベルで今どきのマシンであれば全部メモリに乗せれるんじゃ..というサイズになります。

ということで、Ruby(ver.2.3)のArray,Hash,Stringのメモリ使用量を比較してみました。※Windowsで実施。(気が向いたらLinuxでも)

まず入力用テキストを生成。(その後、改行込みで10MBになるように加工）
1行は「000000074,bx,75,M,182,62」な感じになります。
※読込み時は、String,String,Fixnum,String,Fixnum,Fixnumとしています。

open("input.txt","wb"){|w|
  w.puts %w[No Name Age Sex Height Weight].join(",")

  str = "a"
  (0..4000000).each{|i|
    w.puts [sprintf("%09d",i),str.next!,rand(100),rand(2)==0 ? "M" : "F", rand(60)+130, rand(60)+40 ].join(",")
  }
}

次にそのINPUTに対し、メモリ割り当てを実施した結果が以下。

・メモリ割り当て(MB)

	Hash Array	Array Array	Array Hash	Array Class	Hash String	Array String	Big String (<<)	Big String (read once)	Big String (array join)
Hash	18.2	0	86.2	0	18.2	0	0	0	0
String	28.7	28.7	28.7	28.7	28.7	14.4	11.5	10.0	9.3
Array	14.4	17.7	3.3	3.3	0	3.3	0	0	0
Human	0	0	0	31.6	0	0	0	0	0
Fixnum	0	0	0	0	0	0	0	0	0
Symbol	0	0	0	0	0	0	0	0	0
Total	61.3	46.4	118.2	63.6	47.0	17.7	11.5	10.0	9.3

・オブジェクト数

	Hash Array	Array Array	Array Hash	Array Class	Hash String	Array String	Big String (<<)	Big String (read once)	Big String (array join)
Hash	1	0	376503	0	1	0	0	0	0
String	753008	753008	753008	753008	753006	376503	1	1	1
Array	376503	376504	1	0	0	1	0	0	0
Human	0	0	0	376503	0	0	0	0	0
Fixnum	160	160	160	160	0	0	0	0	0
Symbol	0	0	6	0	0	0	0	0	0
Total	1129672	1129672	1129678	1129671	753007	376504	1	1	1

結果を見るに、Hashはメモリ使用量が多く、Arrayはかなり少なくなっています。
Hash+Arrayで1配列で40byte、Array+Hashで1Hashで240byteと6倍の差があります。
大きな配列/Hashだと、Array+Hashで3.3Mbyte、Hash+Arrayで1Hashで18.2Mbyteと同じく6倍の差があります。

よって、メモリが足りない場合はHash+Arrayではなくて、Array+Stringでbsearchを使ってメモリを削減しましょう！
例：arr.bsearch{|e| "00000001" <=> e[0..7]}.split(",") #O(logN)で若干遅いが、メモリ使用率は3分の1以下！

Classは意外と小さく、88byte/Humanとなりました。StringはArrayと同じ40byte(いわゆるRVALUEというやつのようです）でした。
また、Symbol/Fixnumはほとんどパターンがないので、メモリはほとんど使われていません。(Array/Hashの参照のみ）

よって、一定パターンの整数や文字列が中心のテキストの場合は、Arrayで持つと効率的になると考えられますが、様々な文字列が存在するとStringとArrayのオーバーヘッドでメモリ使用率が上がります。

一方で、Big Stringはテキストサイズが、ほぼメモリ使用量になっています。また、<<で文字列を結合していくと余計なメモリが割り当てられることがわかります。

ということで、今は懐かしの固定長テキストを1つのStringで保持して配列操作ライクなことができれば、メモリ使用率最強(Hash+Arrayの6分の1!)とみなして以下のライブラリを作ってみました。（半分ネタですが、暇になったらテスト作ってgemにしたい）
ちなみに、検索/値代入は配列の2～4倍遅いですが、メモリに入ったら勝ち理論なので問題ないと考えています。
bsearchはRuby/Bsearchhttp://0xcc.net/ruby-bsearch/の丸コピです..

class FixedArray
  include Enumerable

  def initialize str,index,len
    @str,@index,@len = str,[index.first,index.size],len
  end

  def [] i
    @str[i*@len+@index[0],@index[1]]
  end

  def []= i,val
    @str[i*@len++@index[0],@index[1]] = val
  end

  def each
    (0...size).each{|i|
      yield @str[i*@len++@index[0],@index[1]]
    }
  end

  def size
    @str.size/@len
  end

  alias :length :size

  #Below here is a copy of Ruby/Bsearch[http://0xcc.net/ruby-bsearch/]
  #
  # The binary search algorithm is extracted from Jon Bentley's
  # Programming Pearls 2nd ed. p.93
  #

  #
  # Return the lower boundary. (inside)
  #
  def bsearch_lower_boundary (range = 0 ... self.length, &block)
    lower  = range.first() -1
    upper = if range.exclude_end? then range.last else range.last + 1 end
    while lower + 1 != upper
      mid = ((lower + upper) / 2).to_i # for working with mathn.rb (Rational)
      if yield(self[mid]) < 0
        lower = mid
      else
        upper = mid
      end
    end
    return upper
  end

  #
  # This method searches the FIRST occurrence which satisfies a
  # condition given by a block in binary fashion and return the
  # index of the first occurrence. Return nil if not found.
  #
  def bsearch_first (range = 0 ... self.length, &block)
    boundary = bsearch_lower_boundary(range, &block)
    if boundary >= self.length || yield(self[boundary]) != 0
      return nil
    else
      return boundary
    end
  end

  alias bsearch bsearch_first

  #
  # Return the upper boundary. (outside)
  #
  def bsearch_upper_boundary (range = 0 ... self.length, &block)
    lower  = range.first() -1
    upper = if range.exclude_end? then range.last else range.last + 1 end
    while lower + 1 != upper
      mid = ((lower + upper) / 2).to_i # for working with mathn.rb (Rational)
      if yield(self[mid]) <= 0
        lower = mid
      else
        upper = mid
      end
    end
    return lower + 1 # outside of the matching range.
  end

  #
  # This method searches the LAST occurrence which satisfies a
  # condition given by a block in binary fashion and return the
  # index of the last occurrence. Return nil if not found.
  #
  def bsearch_last (range = 0 ... self.length, &block)
    # `- 1' for canceling `lower + 1' in bsearch_upper_boundary.
    boundary = bsearch_upper_boundary(range, &block) - 1

    if (boundary <= -1 || yield(self[boundary]) != 0)
      return nil
    else
      return boundary
    end
  end

  #
  # Return the search result as a Range object.
  #
  def bsearch_range (range = 0 ... self.length, &block)
    lower = bsearch_lower_boundary(range, &block)
    upper = bsearch_upper_boundary(range, &block)
    return lower ... upper
  end

end

class FixedMatrix
  include Enumerable

  def initialize str,indexes,len
    @str,@indexes,@len = str,indexes,len
    @indexes = @indexes.map{|range| [range.first,range.size]}
  end

  def [] i,j
    ind = @indexes[j]
    @str[i*@len+ind[0],ind[1]]
  end

  def get_row i
    @str[i*@len,@len]
  end

  def []= i,j=0,val
    ind = @indexes[j]
    @str[i*@len+ind[0],ind[1]] = val
  end

  def each
    (0...row_size).each{|i|
      @indexes.each{|ind|
        yield @str[i*@len+ind[0],ind[1]]
      }
    }
  end

  def row_size
    @str.size/@len
  end
end

#Usage
big_str = open("input_fixed.txt","rb").read
fa = FixedArray.new(big_str,0..25,28) #1行28文字で、改行はCRLF。
fm = FixedMatrix.new(big_str,[0..8,10..13,15..16,18..18,20..22,24..25],28) #1行28文字で、1カラム目が0-8文字目、2カラム目が10-13文字目...、改行はCRLF。

fa[fa.bsearch_first{|x| x[0..8] <=> "000202764" }].split(",") #=> ["0002027642,"kmxr","69","F","149","68"]
fm.each{|x| x} # x => "000000074","  bx","75","M","182","62"

・メモリ割り当ての検証コード

require 'objspace'

class Human
  attr_accessor :no,:name,:age,:sex,:height,:weight

  def initialize no,name,age,sex,height,weight
    @no,@name,@age,@sex,@height,@weight = no,name,age,sex,height,weight
  end
end


#計測開始
start = Time.now

hash_arr = {}
arr_arr = []
arr_hash = []
arr_class = []
hash_str = {}
arr_str= []
b_str = ""
open("input.txt","rb").each_with_index{|x,i|
  next if i == 0
  puts "#{i}:#{Time.now - start}" if i % 100000 == 0
  x = x.chomp
  y = x.split(",",-1)
  z = y.map.with_index{|e,j| [0,1,3].include?(j) ? e.freeze : e.to_i }
  
  hash_arr[z[0]] = z[1..-1]
  arr_arr << z
  arr_hash << {}.tap{|h| [:no,:name,:age,:sex,:height,:weight].each_with_index{|x,i| h[x] = z[i]}}
  arr_class << Human.new(*z)
  hash_str[y[0]] = y[1..-1].join(",")
  arr_str << x
  b_str << x
}
b_str2 = open("input.txt","rb").read
b_str3 = arr_str.join("")

puts Time.now - start

GC.start

puts "Hash+Array"
puts Hash.new{|h,k| h[k] = []}.tap{|mem| hash_arr.tap{|x| mem[x.class] << x}.each{|k,v| mem[k.class] << k; mem[v.class] << v; v.each{|x| mem[x.class] << x}}}.map{|k,v| [k,v.uniq.size,v.uniq.reduce(0){|sum,e| sum + ObjectSpace.memsize_of(e)} / (1024.0*1024)].join("\t")}.join("\n")

puts
puts "Array+Array"
puts Hash.new{|h,k| h[k] = []}.tap{|mem| arr_arr.tap{|x| mem[x.class] << x}.each{|x| mem[x.class] << x; x.each{|x| mem[x.class] << x}}}.map{|k,v| [k,v.uniq.size,v.uniq.reduce(0){|sum,e| sum + ObjectSpace.memsize_of(e)} / (1024.0*1024)].join("\t")}.join("\n")

puts
puts "Array+Hash"
puts Hash.new{|h,k| h[k] = []}.tap{|mem| arr_hash.tap{|x| mem[x.class] << x}.each{|h| mem[h.class] << h; h.each{|k,v| mem[k.class] << k; mem[v.class] << v}}}.map{|k,v| [k,v.uniq.size,v.uniq.reduce(0){|sum,e| sum + ObjectSpace.memsize_of(e)} / (1024.0*1024)].join("\t")}.join("\n")

puts
puts "Array+Class"
puts Hash.new{|h,k| h[k] = []}.tap{|mem| arr_class.tap{|x| mem[x.class] << x}.each{|c| mem[c.class] << c;c.instance_variables.each{|v| val = c.instance_variable_get(v);mem[val.class] << val}}}.map{|k,v| [k,v.uniq.size,v.uniq.reduce(0){|sum,e| sum + ObjectSpace.memsize_of(e)} / (1024.0*1024)].join("\t")}.join("\n")

puts
puts "Hash+String"
puts Hash.new{|h,k| h[k] = []}.tap{|mem| hash_str.tap{|x| mem[x.class] << x}.each{|k,v| mem[k.class] << k; mem[v.class] << v}}.map{|k,v| [k,v.uniq.size,v.uniq.reduce(0){|sum,e| sum + ObjectSpace.memsize_of(e)} / (1024.0*1024)].join("\t")}.join("\n")

puts
puts "Array+String"
puts Hash.new{|h,k| h[k] = []}.tap{|mem| arr_str.tap{|x| mem[x.class] << x}.each{|x| mem[x.class] << x}}.map{|k,v| [k,v.uniq.size,v.uniq.reduce(0){|sum,e| sum + ObjectSpace.memsize_of(e)} / (1024.0*1024)].join("\t")}.join("\n")

puts
puts "Big String(<<)"
puts Hash.new{|h,k| h[k] = []}.tap{|mem| b_str.tap{|x| mem[x.class] << x}}.map{|k,v| [k,v.uniq.size,v.uniq.reduce(0){|sum,e| sum + ObjectSpace.memsize_of(e)} / (1024.0*1024)].join("\t")}.join("\n")
puts
puts "Big String(read once)"
puts Hash.new{|h,k| h[k] = []}.tap{|mem| b_str2.tap{|x| mem[x.class] << x}}.map{|k,v| [k,v.uniq.size,v.uniq.reduce(0){|sum,e| sum + ObjectSpace.memsize_of(e)} / (1024.0*1024)].join("\t")}.join("\n")

puts
puts "Big String(array join)"
puts Hash.new{|h,k| h[k] = []}.tap{|mem| b_str3.tap{|x| mem[x.class] << x}}.map{|k,v| [k,v.uniq.size,v.uniq.reduce(0){|sum,e| sum + ObjectSpace.memsize_of(e)} / (1024.0*1024)].join("\t")}.join("\n")

備忘録的な何か

技術ブログ的な何かです

Rubyの配列、ハッシュ、文字列のメモリ使用量を見る。そして、固定長テキスト最強説..