Commit b390250

Allow disabling type auto-detection on CSV files

Signed-off-by: Pat Buxton <[email protected]>
1 parent 1646b03 commit b390250

8 files changed: 82 additions & 3 deletions

be/src/exec/file_scanner/csv_scanner.cpp

Lines changed: 8 additions & 3 deletions
@@ -590,7 +590,10 @@ void CSVScanner::_report_rejected_record(const CSVReader::Record& record, const
         _state->append_rejected_record_to_file(record.to_string(), err_msg, _curr_reader->filename());
 }
 
-static TypeDescriptor get_type_desc(const Slice& field) {
+static TypeDescriptor get_type_desc(const Slice& field, const bool& sampleTypes) {
+    if (!sampleTypes) {
+        return TypeDescriptor::create_varchar_type(TypeDescriptor::MAX_VARCHAR_LENGTH);
+    }
     StringParser::ParseResult result;
 
     StringParser::string_to_int<int64_t>(field.get_data(), field.get_size(), &result);
@@ -642,7 +645,8 @@ Status CSVScanner::_get_schema(std::vector<SlotDescriptor>* merged_schema) {
         _curr_reader->split_record(record, &fields);
         for (size_t i = 0; i < fields.size(); i++) {
             // column name: $1, $2, $3...
-            schema.emplace_back(i, fmt::format("${}", i + 1), get_type_desc(fields[i]));
+            schema.emplace_back(i, fmt::format("${}", i + 1),
+                                get_type_desc(fields[i], _scan_range.params.schema_sample_types));
         }
         schemas.emplace_back(schema);
         i++;
@@ -679,7 +683,8 @@ Status CSVScanner::_get_schema_v2(std::vector<SlotDescriptor>* merged_schema) {
            const Slice field(basePtr + column.start_pos, column.length);
 
            // column name: $1, $2, $3...
-           schema.emplace_back(i, fmt::format("${}", i + 1), get_type_desc(field));
+           schema.emplace_back(i, fmt::format("${}", i + 1),
+                               get_type_desc(field, _scan_range.params.schema_sample_types));
        }
        schemas.emplace_back(schema);
        i++;
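
The hunks above only show the entry point of the type sniffer, so here is a rough, self-contained sketch of the control flow this change introduces. It is an illustration under assumptions, not the StarRocks code: `SniffedType` and `sniff_type` are stand-ins for `TypeDescriptor` and `get_type_desc`, and the probe order after the `int64_t` parse visible in the diff is assumed.

```cpp
#include <charconv>
#include <string_view>

// Stand-in for StarRocks' TypeDescriptor results; the real code returns
// TypeDescriptor objects (e.g. create_varchar_type(MAX_VARCHAR_LENGTH)).
enum class SniffedType { Bigint, Double, Boolean, Varchar };

// Sketch of the toggle added in this commit: with type sampling disabled every
// CSV field is reported as VARCHAR; otherwise the field is probed, narrowest
// parse first, and VARCHAR remains the final fallback.
static SniffedType sniff_type(std::string_view field, bool sample_types) {
    if (!sample_types) {
        return SniffedType::Varchar;  // auto_detect_types = false: skip sniffing entirely
    }
    const char* first = field.data();
    const char* last = field.data() + field.size();

    long long int_value = 0;
    if (auto [ptr, ec] = std::from_chars(first, last, int_value);
        ec == std::errc() && ptr == last) {
        return SniffedType::Bigint;
    }
    double double_value = 0.0;
    if (auto [ptr, ec] = std::from_chars(first, last, double_value);
        ec == std::errc() && ptr == last) {
        return SniffedType::Double;
    }
    if (field == "true" || field == "false") {
        return SniffedType::Boolean;
    }
    return SniffedType::Varchar;
}
```

In the scanner itself the flag arrives via `_scan_range.params.schema_sample_types`, so setting `auto_detect_types` to `false` avoids the per-field parsing work entirely and every sampled column surfaces as VARCHAR, which is what the new unit test in csv_scanner_test.cpp below asserts.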

be/test/exec/file_scanner/csv_scanner_test.cpp

Lines changed: 59 additions & 0 deletions
@@ -1308,6 +1308,65 @@ TEST_P(CSVScannerTest, test_get_schema) {
             EXPECT_EQ(expected_schema[i].second, schema[i].type().type) << schema[i].col_name();
         }
     }
+
+    {
+        // sample 1 row, enclose ", escape "\"
+        std::vector<std::pair<std::string, LogicalType>> expected_schema = {
+                {"$1", TYPE_BIGINT}, {"$2", TYPE_VARCHAR}, {"$3", TYPE_DOUBLE}, {"$4", TYPE_BOOLEAN}};
+
+        std::vector<TBrokerRangeDesc> ranges;
+        TBrokerRangeDesc range;
+        range.__set_path("./be/test/exec/test_data/csv_scanner/type_sniff.csv");
+        range.__set_num_of_columns_from_file(0);
+        ranges.push_back(range);
+
+        TBrokerScanRangeParams* params = _obj_pool.add(new TBrokerScanRangeParams());
+        params->__set_row_delimiter('\n');
+        params->__set_column_separator(',');
+        params->__set_enclose('"');
+        params->__set_escape('\\');
+        params->__set_schema_sample_file_row_count(1);
+        auto scanner = create_csv_scanner({}, ranges, params);
+        EXPECT_OK(scanner->open());
+        std::vector<SlotDescriptor> schema;
+        EXPECT_OK(scanner->get_schema(&schema));
+        EXPECT_EQ(expected_schema.size(), schema.size());
+
+        for (size_t i = 0; i < schema.size(); i++) {
+            EXPECT_EQ(expected_schema[i].first, schema[i].col_name());
+            EXPECT_EQ(expected_schema[i].second, schema[i].type().type) << schema[i].col_name();
+        }
+    }
+
+    {
+        // sample 1 row, enclose ", escape "\", no type detection
+        std::vector<std::pair<std::string, LogicalType>> expected_schema = {
+                {"$1", TYPE_VARCHAR}, {"$2", TYPE_VARCHAR}, {"$3", TYPE_VARCHAR}, {"$4", TYPE_VARCHAR}};
+
+        std::vector<TBrokerRangeDesc> ranges;
+        TBrokerRangeDesc range;
+        range.__set_path("./be/test/exec/test_data/csv_scanner/type_sniff.csv");
+        range.__set_num_of_columns_from_file(0);
+        ranges.push_back(range);
+
+        TBrokerScanRangeParams* params = _obj_pool.add(new TBrokerScanRangeParams());
+        params->__set_row_delimiter('\n');
+        params->__set_column_separator(',');
+        params->__set_enclose('"');
+        params->__set_escape('\\');
+        params->__set_schema_sample_file_row_count(1);
+        params->__set_schema_sample_types(false);
+        auto scanner = create_csv_scanner({}, ranges, params);
+        EXPECT_OK(scanner->open());
+        std::vector<SlotDescriptor> schema;
+        EXPECT_OK(scanner->get_schema(&schema));
+        EXPECT_EQ(expected_schema.size(), schema.size());
+
+        for (size_t i = 0; i < schema.size(); i++) {
+            EXPECT_EQ(expected_schema[i].first, schema[i].col_name());
+            EXPECT_EQ(expected_schema[i].second, schema[i].type().type) << schema[i].col_name();
+        }
+    }
 }
 
 TEST_P(CSVScannerTest, test_flexible_column_mapping) {

be/test/exec/test_data/csv_scanner/type_sniff.csv

Lines changed: 1 addition & 0 deletions

@@ -0,0 +1 @@
+001,Jok\,e,"21.3",false

docs/en/sql-reference/sql-functions/table-functions/files.md

Lines changed: 2 additions & 0 deletions
@@ -218,6 +218,7 @@ You can configure the sampling rule using the following parameters:
 
 - `auto_detect_sample_files`: the number of random data files to sample in each batch. By default, the first and last files are selected. Range: `[0, + ∞]`. Default: `2`.
 - `auto_detect_sample_rows`: the number of data rows to scan in each sampled data file. Range: `[0, + ∞]`. Default: `500`.
+- `auto_detect_types`: (valid for CSV files only) whether to detect the data types of the sampled columns or to treat all columns as strings. Valid values: `{true | false}`. Default: `true`.
 
 After the sampling, StarRocks unionizes the columns from all the data files according to these rules:
 
@@ -227,6 +228,7 @@ After the sampling, StarRocks unionizes the columns from all the data files according to these rules:
 - Integer columns together with `FLOAT` type columns will be unionized as the DECIMAL type.
 - String types are used for unionizing other types.
 - Generally, the `STRING` type can be used to unionize all data types.
+- If type auto-detection is turned off, all columns are returned as `STRING`.
 
 You can refer to Example 5.

docs/ja/sql-reference/sql-functions/table-functions/files.md

Lines changed: 2 additions & 0 deletions
@@ -218,6 +218,7 @@ v3.2 以降、`FILES()` は同じバッチのデータファイルの自動ス
 
 - `auto_detect_sample_files`: the number of random data files to sample in each batch. By default, the first and last files are selected. Range: `[0, + ∞]`. Default: `2`.
 - `auto_detect_sample_rows`: the number of data rows to scan in each sampled data file. Range: `[0, + ∞]`. Default: `500`.
+- `auto_detect_types`: (valid for CSV only) whether to infer the data types of the sampled columns or to assume they are strings. `{true | false}`. Default: `true`.
 
 After the sampling, StarRocks unionizes the columns from all the data files according to the following rules.
 
@@ -227,6 +228,7 @@ v3.2 以降、`FILES()` は同じバッチのデータファイルの自動ス
 - Integer columns together with `FLOAT` type columns are unionized as the DECIMAL type.
 - String types are used for unionizing other types.
 - Generally, the `STRING` type can be used to unionize all data types.
+- If type auto-detection is turned off, all columns are returned as `STRING`.
 
 You can refer to Example 5.

docs/zh/sql-reference/sql-functions/table-functions/files.md

Lines changed: 2 additions & 0 deletions
@@ -218,6 +218,7 @@ CSV format example:
 
 - `auto_detect_sample_files`: the number of random data files to sample in each batch. By default, the first and last files are selected. Range: `[0, + ∞]`. Default: `2`.
 - `auto_detect_sample_rows`: the number of data rows to scan in each sampled data file. Range: `[0, + ∞]`. Default: `500`.
+- `auto_detect_types`: (valid for CSV files only) whether to infer the data types of the sampled columns or to assume they are strings. `{true | false}`. Default: `true`.
 
 After the sampling, StarRocks unionizes the columns from all the data files according to the following rules:
 
@@ -227,6 +228,7 @@ CSV format example:
 - Integer columns together with `FLOAT` type columns will be unionized as the DECIMAL type.
 - String types are used for unionizing other types.
 - Generally, the `STRING` type can be used to unionize all data types.
+- If type auto-detection is turned off, all columns are returned as the `STRING` type.
 
 You can refer to Example 5.

fe/fe-core/src/main/java/com/starrocks/catalog/TableFunctionTable.java

Lines changed: 7 additions & 0 deletions
@@ -128,6 +128,7 @@ public class TableFunctionTable extends Table {
 
     public static final String PROPERTY_AUTO_DETECT_SAMPLE_FILES = "auto_detect_sample_files";
     public static final String PROPERTY_AUTO_DETECT_SAMPLE_ROWS = "auto_detect_sample_rows";
+    public static final String PROPERTY_AUTO_DETECT_TYPES = "auto_detect_types";
 
     private static final String PROPERTY_FILL_MISMATCH_COLUMN_WITH = "fill_mismatch_column_with";
 
@@ -178,6 +179,7 @@ public enum FilesTableType {
     // for load/query data
     private int autoDetectSampleFiles = DEFAULT_AUTO_DETECT_SAMPLE_FILES;
     private int autoDetectSampleRows = DEFAULT_AUTO_DETECT_SAMPLE_ROWS;
+    private boolean autoDetectTypes = true;
 
     private List<String> columnsFromPath = new ArrayList<>();
     private boolean strictMode = false;
@@ -505,6 +507,10 @@ private void parsePropertiesForLoad(Map<String, String> properties) throws DdlEx
            }
        }
 
+        if (properties.containsKey(PROPERTY_AUTO_DETECT_TYPES)) {
+            autoDetectTypes = Boolean.parseBoolean(properties.get(PROPERTY_AUTO_DETECT_TYPES));
+        }
+
        if (properties.containsKey(PROPERTY_CSV_COLUMN_SEPARATOR)) {
            csvColumnSeparator = Delimiter.convertDelimiter(properties.get(PROPERTY_CSV_COLUMN_SEPARATOR));
            int len = csvColumnSeparator.getBytes(StandardCharsets.UTF_8).length;
@@ -605,6 +611,7 @@ private PGetFileSchemaRequest getGetFileSchemaRequest(List<TBrokerFileStatus> fi
        params.setProperties(properties);
        params.setSchema_sample_file_count(autoDetectSampleFiles);
        params.setSchema_sample_file_row_count(autoDetectSampleRows);
+       params.setSchema_sample_types(autoDetectTypes);
        params.setEnclose(csvEnclose);
        params.setEscape(csvEscape);
        params.setSkip_header(csvSkipHeader);

gensrc/thrift/PlanNodes.thrift

Lines changed: 1 addition & 0 deletions
@@ -284,6 +284,7 @@ struct TBrokerScanRangeParams {
     31: optional i64 schema_sample_file_row_count
     32: optional bool flexible_column_mapping
     33: optional TFileScanType file_scan_type
+    34: optional bool schema_sample_types = true
 }
 
 // Broker scan range
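
Because the new Thrift field declares a default of `true`, a frontend that never sets it (for example, one built before this change) leaves type sniffing enabled, so the wire change stays backward compatible. Below is a minimal sketch of that default semantics, assuming standard Thrift C++ codegen; the struct is a hand-written stand-in, not the generated TBrokerScanRangeParams.

```cpp
#include <cassert>

// Hand-written stand-in mirroring how a Thrift field declared as
// "34: optional bool schema_sample_types = true" behaves after C++ codegen:
// the member starts at the declared default until the setter is called.
struct FakeBrokerScanRangeParams {
    bool schema_sample_types = true;

    void __set_schema_sample_types(bool value) { schema_sample_types = value; }
};

int main() {
    // A frontend that predates the field never calls the setter,
    // so the backend still sees "sniffing enabled" and behavior is unchanged.
    FakeBrokerScanRangeParams from_old_fe;
    assert(from_old_fe.schema_sample_types);

    // A frontend passing auto_detect_types = false flips the flag explicitly.
    FakeBrokerScanRangeParams from_new_fe;
    from_new_fe.__set_schema_sample_types(false);
    assert(!from_new_fe.schema_sample_types);
    return 0;
}
```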
