diff --git a/ruby/red-arrow/lib/arrow/array-builder.rb b/ruby/red-arrow/lib/arrow/array-builder.rb
index 2ccf50f3c1b..5bb1ee74561 100644
--- a/ruby/red-arrow/lib/arrow/array-builder.rb
+++ b/ruby/red-arrow/lib/arrow/array-builder.rb
@@ -74,14 +74,25 @@ def detect_builder_info(value, builder_info)
             detected: true,
           }
         when Integer
-          if value < 0
+          # Defer the concrete builder choice: only record the signedness
+          # and the value range seen so far.
+          builder_info ||= {}
+          min = builder_info[:min] || value
+          max = builder_info[:max] || value
+          min = value if value < min
+          max = value if value > max
+
+          if builder_info[:builder_type] == :int || value < 0
             {
-              builder: IntArrayBuilder.new,
-              detected: true,
+              builder_type: :int,
+              min: min,
+              max: max,
             }
           else
             {
-              builder: UIntArrayBuilder.new,
+              builder_type: :uint,
+              min: min,
+              max: max,
             }
           end
         when Time
@@ -150,18 +161,23 @@ def detect_builder_info(value, builder_info)
             end
           end
         when ::Array
-          sub_builder_info = nil
+          # Start from the element info recorded for earlier lists (if any)
+          # so that the integer range covers every sub-list.
+          sub_builder_info = builder_info && builder_info[:value_builder_info]
           value.each do |sub_value|
             sub_builder_info = detect_builder_info(sub_value, sub_builder_info)
             break if sub_builder_info and sub_builder_info[:detected]
           end
           if sub_builder_info
-            sub_builder = sub_builder_info[:builder]
-            return builder_info unless sub_builder
+            sub_builder = sub_builder_info[:builder] || create_builder(sub_builder_info)
+            # NOTE(review): this used to return builder_info; it now returns
+            # the element-level info -- confirm this is intended.
+            return sub_builder_info unless sub_builder
             sub_value_data_type = sub_builder.value_data_type
             field = Field.new("item", sub_value_data_type)
             {
               builder: ListArrayBuilder.new(ListDataType.new(field)),
+              value_builder_info: sub_builder_info,
               detected: sub_builder_info[:detected],
             }
           else
@@ -186,6 +202,39 @@ def create_builder(builder_info)
           data_type = Decimal256DataType.new(builder_info[:precision],
                                              builder_info[:scale])
           Decimal256ArrayBuilder.new(data_type)
+        when :int
+          min = builder_info[:min]
+          max = builder_info[:max]
+
+          # Narrowest signed builder that holds [min, max]; fall back to
+          # strings when the range does not fit in 64 bits.
+          if GLib::MININT8 <= min && max <= GLib::MAXINT8
+            Int8ArrayBuilder.new
+          elsif GLib::MININT16 <= min && max <= GLib::MAXINT16
+            Int16ArrayBuilder.new
+          elsif GLib::MININT32 <= min && max <= GLib::MAXINT32
+            Int32ArrayBuilder.new
+          elsif GLib::MININT64 <= min && max <= GLib::MAXINT64
+            Int64ArrayBuilder.new
+          else
+            StringArrayBuilder.new
+          end
+        when :uint
+          max = builder_info[:max]
+
+          # Narrowest unsigned builder that holds max; fall back to strings
+          # when it does not fit in 64 bits.
+          if max <= GLib::MAXUINT8
+            UInt8ArrayBuilder.new
+          elsif max <= GLib::MAXUINT16
+            UInt16ArrayBuilder.new
+          elsif max <= GLib::MAXUINT32
+            UInt32ArrayBuilder.new
+          elsif max <= GLib::MAXUINT64
+            UInt64ArrayBuilder.new
+          else
+            StringArrayBuilder.new
+          end
         else
           nil
         end
diff --git a/ruby/red-arrow/test/test-array-builder.rb b/ruby/red-arrow/test/test-array-builder.rb
index 7a2d42e54b3..f629eec6616 100644
--- a/ruby/red-arrow/test/test-array-builder.rb
+++ b/ruby/red-arrow/test/test-array-builder.rb
@@ -147,44 +147,404 @@ def assert_build(builder_class, raw_array)
                      ])
       end
 
-      test("lists") do
-        values = [
-          [0, 1, 2],
-          [3, 4],
-        ]
-        array = Arrow::Array.new(values)
-        data_type = Arrow::ListDataType.new(Arrow::UInt8DataType.new)
-        assert_equal({
-                       data_type: data_type,
-                       values: [
-                         [0, 1, 2],
-                         [3, 4],
-                       ],
-                     },
-                     {
-                       data_type: array.value_data_type,
-                       values: array.to_a,
-                     })
-      end
+      sub_test_case("nested integer list") do
+        test("lists") do
+          values = [
+            [0, 1, 2],
+            [3, 4],
+          ]
+          array = Arrow::Array.new(values)
+          data_type = Arrow::ListDataType.new(:uint8)
+          assert_equal({
+                         data_type: data_type,
+                         values: [
+                           [0, 1, 2],
+                           [3, 4],
+                         ],
+                       },
+                       {
+                         data_type: array.value_data_type,
+                         values: array.to_a,
+                       })
+        end
 
-      test("lists") do
-        values = [
-          [0, -1, 2],
-          [3, 4],
-        ]
-        array = Arrow::Array.new(values)
-        data_type = Arrow::ListDataType.new(Arrow::Int8DataType.new)
-        assert_equal({
-                       data_type: data_type,
-                       values: [
-                         [0, -1, 2],
-                         [3, 4],
-                       ],
-                     },
-                     {
-                       data_type: array.value_data_type,
-                       values: array.to_a,
-                     })
+        test("list<int8>s boundary") do
+          values = [
+            [0, GLib::MININT8],
+            [GLib::MAXINT8],
+          ]
+          array = Arrow::Array.new(values)
+          data_type = Arrow::ListDataType.new(:int8)
+
+          assert_equal({
+                         data_type: data_type,
+                         values: [
+                           [0, GLib::MININT8],
+                           [GLib::MAXINT8],
+                         ],
+                       },
+                       {
+                         data_type: array.value_data_type,
+                         values: array.to_a,
+                       })
+        end
+
+        test("lists inferred from int8 underflow") do
+          values = [
+            [0, GLib::MININT8 - 1],
+            [GLib::MAXINT8],
+          ]
+          array = Arrow::Array.new(values)
+          data_type = Arrow::ListDataType.new(:int16)
+
+          assert_equal({
+                         data_type: data_type,
+                         values: [
+                           [0, GLib::MININT8 - 1],
+                           [GLib::MAXINT8],
+                         ],
+                       },
+                       {
+                         data_type: array.value_data_type,
+                         values: array.to_a,
+                       })
+        end
+
+        test("lists inferred from int8 overflow") do
+          values = [
+            [0, GLib::MAXINT8 + 1],
+            [GLib::MININT8],
+          ]
+          array = Arrow::Array.new(values)
+          data_type = Arrow::ListDataType.new(:int16)
+
+          assert_equal({
+                         data_type: data_type,
+                         values: [
+                           [0, GLib::MAXINT8 + 1],
+                           [GLib::MININT8],
+                         ],
+                       },
+                       {
+                         data_type: array.value_data_type,
+                         values: array.to_a,
+                       })
+        end
+
+        test("list<int16>s boundary") do
+          values = [
+            [0, GLib::MININT16],
+            [GLib::MAXINT16],
+          ]
+          array = Arrow::Array.new(values)
+          data_type = Arrow::ListDataType.new(:int16)
+
+          assert_equal({
+                         data_type: data_type,
+                         values: [
+                           [0, GLib::MININT16],
+                           [GLib::MAXINT16],
+                         ],
+                       },
+                       {
+                         data_type: array.value_data_type,
+                         values: array.to_a,
+                       })
+        end
+
+        test("lists inferred from int16 underflow") do
+          values = [
+            [0, GLib::MININT16 - 1],
+            [GLib::MAXINT16],
+          ]
+          array = Arrow::Array.new(values)
+          data_type = Arrow::ListDataType.new(:int32)
+
+          assert_equal({
+                         data_type: data_type,
+                         values: [
+                           [0, GLib::MININT16 - 1],
+                           [GLib::MAXINT16],
+                         ],
+                       },
+                       {
+                         data_type: array.value_data_type,
+                         values: array.to_a,
+                       })
+        end
+
+        test("lists inferred from int16 overflow") do
+          values = [
+            [0, GLib::MAXINT16 + 1],
+            [GLib::MININT16],
+          ]
+          array = Arrow::Array.new(values)
+          data_type = Arrow::ListDataType.new(:int32)
+
+          assert_equal({
+                         data_type: data_type,
+                         values: [
+                           [0, GLib::MAXINT16 + 1],
+                           [GLib::MININT16],
+                         ],
+                       },
+                       {
+                         data_type: array.value_data_type,
+                         values: array.to_a,
+                       })
+        end
+
+        test("list<int32>s boundary") do
+          values = [
+            [0, GLib::MININT32],
+            [GLib::MAXINT32],
+          ]
+          array = Arrow::Array.new(values)
+          data_type = Arrow::ListDataType.new(:int32)
+
+          assert_equal({
+                         data_type: data_type,
+                         values: [
+                           [0, GLib::MININT32],
+                           [GLib::MAXINT32],
+                         ],
+                       },
+                       {
+                         data_type: array.value_data_type,
+                         values: array.to_a,
+                       })
+        end
+
+        test("lists inferred from int32 underflow") do
+          values = [
+            [0, GLib::MININT32 - 1],
+            [GLib::MAXINT32],
+          ]
+          array = Arrow::Array.new(values)
+          data_type = Arrow::ListDataType.new(:int64)
+
+          assert_equal({
+                         data_type: data_type,
+                         values: [
+                           [0, GLib::MININT32 - 1],
+                           [GLib::MAXINT32],
+                         ],
+                       },
+                       {
+                         data_type: array.value_data_type,
+                         values: array.to_a,
+                       })
+        end
+
+        test("lists inferred from int32 overflow") do
+          values = [
+            [0, GLib::MAXINT32 + 1],
+            [GLib::MININT32],
+          ]
+          array = Arrow::Array.new(values)
+          data_type = Arrow::ListDataType.new(:int64)
+
+          assert_equal({
+                         data_type: data_type,
+                         values: [
+                           [0, GLib::MAXINT32 + 1],
+                           [GLib::MININT32],
+                         ],
+                       },
+                       {
+                         data_type: array.value_data_type,
+                         values: array.to_a,
+                       })
+        end
+
+        test("string fallback from nested int64 array overflow") do
+          values = [
+            [0, GLib::MAXINT64 + 1],
+            [GLib::MININT64],
+          ]
+          array = Arrow::Array.new(values)
+          data_type = Arrow::ListDataType.new(:string)
+
+          assert_equal({
+                         data_type: data_type,
+                         values: [
+                           ["0", "#{GLib::MAXINT64 + 1}"],
+                           ["#{GLib::MININT64}"],
+                         ],
+                       },
+                       {
+                         data_type: array.value_data_type,
+                         values: array.to_a,
+                       })
+        end
+
+        test("string fallback from nested int64 array underflow") do
+          values = [
+            [0, GLib::MININT64 - 1],
+            [GLib::MAXINT64],
+          ]
+          array = Arrow::Array.new(values)
+          data_type = Arrow::ListDataType.new(:string)
+
+          assert_equal({
+                         data_type: data_type,
+                         values: [
+                           ["0", "#{GLib::MININT64 - 1}"],
+                           ["#{GLib::MAXINT64}"],
+                         ],
+                       },
+                       {
+                         data_type: array.value_data_type,
+                         values: array.to_a,
+                       })
+        end
+
+        test("list<uint8>s boundary") do
+          values = [
+            [0, GLib::MAXUINT8],
+            [GLib::MAXUINT8],
+          ]
+          array = Arrow::Array.new(values)
+          data_type = Arrow::ListDataType.new(:uint8)
+
+          assert_equal({
+                         data_type: data_type,
+                         values: [
+                           [0, GLib::MAXUINT8],
+                           [GLib::MAXUINT8],
+                         ],
+                       },
+                       {
+                         data_type: array.value_data_type,
+                         values: array.to_a,
+                       })
+        end
+
+        test("lists inferred from uint8 overflow") do
+          values = [
+            [0, GLib::MAXUINT8 + 1],
+            [GLib::MAXUINT8],
+          ]
+          array = Arrow::Array.new(values)
+          data_type = Arrow::ListDataType.new(:uint16)
+
+          assert_equal({
+                         data_type: data_type,
+                         values: [
+                           [0, GLib::MAXUINT8 + 1],
+                           [GLib::MAXUINT8],
+                         ],
+                       },
+                       {
+                         data_type: array.value_data_type,
+                         values: array.to_a,
+                       })
+        end
+
+        test("list<uint16>s boundary") do
+          values = [
+            [0, GLib::MAXUINT16],
+            [GLib::MAXUINT16],
+          ]
+          array = Arrow::Array.new(values)
+          data_type = Arrow::ListDataType.new(:uint16)
+
+          assert_equal({
+                         data_type: data_type,
+                         values: [
+                           [0, GLib::MAXUINT16],
+                           [GLib::MAXUINT16],
+                         ],
+                       },
+                       {
+                         data_type: array.value_data_type,
+                         values: array.to_a,
+                       })
+        end
+
+        test("lists inferred from uint16 overflow") do
+          values = [
+            [0, GLib::MAXUINT16 + 1],
+            [GLib::MAXUINT16],
+          ]
+          array = Arrow::Array.new(values)
+          data_type = Arrow::ListDataType.new(:uint32)
+
+          assert_equal({
+                         data_type: data_type,
+                         values: [
+                           [0, GLib::MAXUINT16 + 1],
+                           [GLib::MAXUINT16],
+                         ],
+                       },
+                       {
+                         data_type: array.value_data_type,
+                         values: array.to_a,
+                       })
+        end
+
+        test("list<uint32>s boundary") do
+          values = [
+            [0, GLib::MAXUINT32],
+            [GLib::MAXUINT32],
+          ]
+          array = Arrow::Array.new(values)
+          data_type = Arrow::ListDataType.new(:uint32)
+
+          assert_equal({
+                         data_type: data_type,
+                         values: [
+                           [0, GLib::MAXUINT32],
+                           [GLib::MAXUINT32],
+                         ],
+                       },
+                       {
+                         data_type: array.value_data_type,
+                         values: array.to_a,
+                       })
+        end
+
+        test("lists inferred from uint32 overflow") do
+          values = [
+            [0, GLib::MAXUINT32 + 1],
+            [GLib::MAXUINT32],
+          ]
+          array = Arrow::Array.new(values)
+          data_type = Arrow::ListDataType.new(:uint64)
+
+          assert_equal({
+                         data_type: data_type,
+                         values: [
+                           [0, GLib::MAXUINT32 + 1],
+                           [GLib::MAXUINT32],
+                         ],
+                       },
+                       {
+                         data_type: array.value_data_type,
+                         values: array.to_a,
+                       })
+        end
+
+        test("string fallback from nested uint64 array overflow") do
+          values = [
+            [0, GLib::MAXUINT64 + 1],
+            [GLib::MAXUINT64],
+          ]
+          array = Arrow::Array.new(values)
+          data_type = Arrow::ListDataType.new(:string)
+
+          assert_equal({
+                         data_type: data_type,
+                         values: [
+                           ["0", "#{GLib::MAXUINT64 + 1}"],
+                           ["#{GLib::MAXUINT64}"],
+                         ],
+                       },
+                       {
+                         data_type: array.value_data_type,
+                         values: array.to_a,
+                       })
+        end
       end
     end
 