Skip to content

Commit b6f853f

Browse files
authored
feat: Add case_insensitive param to some Elastic DSL queries (#6005)
* feat: Add case_insensitive param to some Elastic DSL queries Signed-off-by: Darkheir <[email protected]> * Apply PR review suggestion Signed-off-by: Darkheir <[email protected]> --------- Signed-off-by: Darkheir <[email protected]>
1 parent 0358ca3 commit b6f853f

File tree

12 files changed

+234
-17
lines changed

12 files changed

+234
-17
lines changed

docs/reference/es_compatible_api.md

Lines changed: 42 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -695,10 +695,11 @@ When working on text, it is recommended to only use `term` queries on fields con
695695

696696
#### Supported Parameters
697697

698-
| Variable | Type | Description | Default |
699-
| -------- | -------- | ---------------------------------------------------------------------------- | ------- |
700-
| `value` | String | Term value. This is the string representation of a token after tokenization. | - |
701-
| `boost` | `Number` | Multiplier boost for score computation | 1.0 |
698+
| Variable | Type | Description | Default |
699+
| ------------------ | ------- | ---------------------------------------------------------------------------- | ------- |
700+
| `value` | String | Term value. This is the string representation of a token after tokenization. | - |
701+
| `boost` | Number | Multiplier boost for score computation | 1.0 |
702+
| `case_insensitive` | Boolean | Allows ASCII case insensitive matching of the value. | false |
702703

703704

704705

@@ -763,9 +764,10 @@ Returns documents that contain a specific prefix in a provided field.
763764

764765
#### Supported Parameters
765766

766-
| Variable | Type | Description | Default |
767-
| -------- | ------ | ----------------------------------------------- | ------- |
768-
| `value` | String | Beginning characters of terms you wish to find. | - |
767+
| Variable | Type | Description | Default |
768+
| ------------------ | ------- | ---------------------------------------------------- | ------- |
769+
| `value` | String | Beginning characters of terms you wish to find. | - |
770+
| `case_insensitive` | Boolean | Allows ASCII case insensitive matching of the value. | false |
769771

770772
### `wildcard`
771773

@@ -791,9 +793,39 @@ Returns documents that contain terms matching a wildcard pattern:
791793

792794
#### Supported Parameters
793795

794-
| Variable | Type | Description | Default |
795-
| -------- | ------ | -------------------------------------------- | ------- |
796-
| `value` | String | Wildcard pattern for terms you wish to find. | - |
796+
| Variable | Type | Description | Default |
797+
| ------------------ | ------- | ---------------------------------------------------- | ------- |
798+
| `value` | String | Wildcard pattern for terms you wish to find. | - |
799+
| `boost` | Number | Multiplier boost for score computation. | 1.0 |
800+
| `case_insensitive` | Boolean | Allows ASCII case insensitive matching of the value. | false |
801+
802+
803+
### `regexp`
804+
805+
[Elasticsearch reference documentation](https://www.elastic.co/guide/en/elasticsearch/reference/8.8/query-dsl-regexp-query.html)
806+
807+
Returns documents that contain terms matching a regular expression.
808+
809+
#### Example
810+
811+
```json
812+
{
813+
"query": {
814+
"regexp": {
815+
"author.login": {
816+
"value": "adm.*n"
817+
}
818+
}
819+
}
820+
}
821+
```
822+
823+
#### Supported Parameters
824+
825+
| Variable | Type | Description | Default |
826+
| ------------------ | ------- | ---------------------------------------------------- | ------- |
827+
| `value` | String | Regular expression for terms you wish to find. | - |
828+
| `case_insensitive` | Boolean | Allows ASCII case insensitive matching of the value. | false |
797829

798830

799831
### About the `lenient` argument

quickwit/quickwit-query/src/elastic_query_dsl/prefix_query.rs

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,8 @@ pub(crate) struct PrefixQuery {
2929
#[serde(deny_unknown_fields)]
3030
pub struct PrefixQueryParams {
3131
value: String,
32+
#[serde(default)]
33+
case_insensitive: bool,
3234
}
3335

3436
impl ConvertibleToQueryAst for PrefixQuery {
@@ -45,6 +47,7 @@ impl ConvertibleToQueryAst for PrefixQuery {
4547
field: self.field,
4648
value: wildcard,
4749
lenient: true,
50+
case_insensitive: self.params.case_insensitive,
4851
}
4952
.into())
5053
}
@@ -64,7 +67,10 @@ impl From<OneFieldMap<StringOrStructForSerialization<PrefixQueryParams>>> for Pr
6467

6568
impl From<String> for PrefixQueryParams {
6669
fn from(value: String) -> PrefixQueryParams {
67-
PrefixQueryParams { value }
70+
PrefixQueryParams {
71+
value,
72+
case_insensitive: false,
73+
}
6874
}
6975
}
7076

quickwit/quickwit-query/src/elastic_query_dsl/regex_query.rs

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -22,16 +22,22 @@ use crate::query_ast::{QueryAst, RegexQuery as AstRegexQuery};
2222
#[serde(deny_unknown_fields)]
2323
pub struct RegexQueryParams {
2424
value: String,
25-
// we could probably add case_insensitive
25+
#[serde(default)]
26+
case_insensitive: bool,
2627
}
2728

2829
pub type RegexQuery = OneFieldMap<RegexQueryParams>;
2930

3031
impl ConvertibleToQueryAst for RegexQuery {
3132
fn convert_to_query_ast(self) -> anyhow::Result<QueryAst> {
33+
let regex = if self.value.case_insensitive {
34+
format!("(?i){}", self.value.value)
35+
} else {
36+
self.value.value.clone()
37+
};
3238
Ok(AstRegexQuery {
3339
field: self.field,
34-
regex: self.value.value,
40+
regex,
3541
}
3642
.into())
3743
}

quickwit/quickwit-query/src/elastic_query_dsl/term_query.rs

Lines changed: 18 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@ impl From<String> for TermQueryParams {
4141
TermQueryParams {
4242
value: query,
4343
boost: None,
44+
case_insensitive: false,
4445
}
4546
}
4647
}
@@ -70,6 +71,8 @@ pub struct TermQueryParams {
7071
pub value: String,
7172
#[serde(default)]
7273
pub boost: Option<NotNaNf32>,
74+
#[serde(default)]
75+
case_insensitive: bool,
7376
}
7477

7578
pub fn term_query_from_field_value(field: impl ToString, value: impl ToString) -> TermQuery {
@@ -78,6 +81,7 @@ pub fn term_query_from_field_value(field: impl ToString, value: impl ToString) -
7881
value: TermQueryParams {
7982
value: value.to_string(),
8083
boost: None,
84+
case_insensitive: false,
8185
},
8286
}
8387
}
@@ -90,7 +94,20 @@ impl From<TermQuery> for ElasticQueryDslInner {
9094

9195
impl ConvertibleToQueryAst for TermQuery {
9296
fn convert_to_query_ast(self) -> anyhow::Result<QueryAst> {
93-
let TermQueryParams { value, boost } = self.value;
97+
let TermQueryParams {
98+
value,
99+
boost,
100+
case_insensitive,
101+
} = self.value;
102+
if case_insensitive {
103+
let ci_value = format!("(?i){}", regex::escape(&value));
104+
let term_ast: QueryAst = query_ast::RegexQuery {
105+
field: self.field,
106+
regex: ci_value,
107+
}
108+
.into();
109+
return Ok(term_ast.boost(boost));
110+
}
94111
let term_ast: QueryAst = query_ast::TermQuery {
95112
field: self.field,
96113
value,

quickwit/quickwit-query/src/elastic_query_dsl/wildcard_query.rs

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,8 @@ pub struct WildcardQueryParams {
3232
value: String,
3333
#[serde(default)]
3434
pub boost: Option<NotNaNf32>,
35+
#[serde(default)]
36+
case_insensitive: bool,
3537
}
3638

3739
impl ConvertibleToQueryAst for WildcardQuery {
@@ -40,6 +42,7 @@ impl ConvertibleToQueryAst for WildcardQuery {
4042
field: self.field,
4143
value: self.params.value,
4244
lenient: true,
45+
case_insensitive: self.params.case_insensitive,
4346
}
4447
.into();
4548
Ok(wildcard_ast.boost(self.params.boost))
@@ -60,7 +63,11 @@ impl From<OneFieldMap<StringOrStructForSerialization<WildcardQueryParams>>> for
6063

6164
impl From<String> for WildcardQueryParams {
6265
fn from(value: String) -> WildcardQueryParams {
63-
WildcardQueryParams { value, boost: None }
66+
WildcardQueryParams {
67+
value,
68+
boost: None,
69+
case_insensitive: false,
70+
}
6471
}
6572
}
6673

quickwit/quickwit-query/src/query_ast/user_input_query.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -293,6 +293,7 @@ fn convert_user_input_literal(
293293
field: field_name,
294294
value: phrase.clone(),
295295
lenient,
296+
case_insensitive: false,
296297
}
297298
.into()
298299
} else {

quickwit/quickwit-query/src/query_ast/wildcard_query.rs

Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@ pub struct WildcardQuery {
3232
pub value: String,
3333
/// Support missing fields
3434
pub lenient: bool,
35+
pub case_insensitive: bool,
3536
}
3637

3738
impl From<WildcardQuery> for QueryAst {
@@ -133,6 +134,11 @@ impl WildcardQuery {
133134
let tokenizer_name = text_field_indexing.tokenizer();
134135
let regex =
135136
sub_query_parts_to_regex(sub_query_parts, tokenizer_name, tokenizer_manager)?;
137+
let regex = if self.case_insensitive {
138+
format!("(?i){}", regex)
139+
} else {
140+
regex
141+
};
136142

137143
Ok((field, None, regex))
138144
}
@@ -147,6 +153,11 @@ impl WildcardQuery {
147153
let tokenizer_name = text_field_indexing.tokenizer();
148154
let regex =
149155
sub_query_parts_to_regex(sub_query_parts, tokenizer_name, tokenizer_manager)?;
156+
let regex = if self.case_insensitive {
157+
format!("(?i){}", regex)
158+
} else {
159+
regex
160+
};
150161

151162
let mut term_for_path = Term::from_field_json_path(
152163
field,
@@ -219,6 +230,7 @@ mod tests {
219230
field: "text_field".to_string(),
220231
value: "MyString Wh1ch?a.nOrMal Tokenizer would*cut".to_string(),
221232
lenient: false,
233+
case_insensitive: false,
222234
};
223235

224236
let tokenizer_manager = create_default_quickwit_tokenizer_manager();
@@ -261,6 +273,7 @@ mod tests {
261273
field: "text_field".to_string(),
262274
value: "MyString Wh1ch\\?a.nOrMal Tokenizer would\\*cut".to_string(),
263275
lenient: false,
276+
case_insensitive: false,
264277
};
265278

266279
let tokenizer_manager = create_default_quickwit_tokenizer_manager();
@@ -305,6 +318,7 @@ mod tests {
305318
field: "json_field.Inner.Fie*ld".to_string(),
306319
value: "MyString Wh1ch?a.nOrMal Tokenizer would*cut".to_string(),
307320
lenient: false,
321+
case_insensitive: false,
308322
};
309323

310324
let tokenizer_manager = create_default_quickwit_tokenizer_manager();
@@ -347,6 +361,7 @@ mod tests {
347361
field: "my_missing_field".to_string(),
348362
value: "My query value*".to_string(),
349363
lenient: false,
364+
case_insensitive: false,
350365
};
351366
let tokenizer_manager = create_default_quickwit_tokenizer_manager();
352367
let schema = single_text_field_schema("my_field", "whitespace");
@@ -359,4 +374,47 @@ mod tests {
359374
};
360375
assert_eq!(missing_field_full_path, "my_missing_field");
361376
}
377+
378+
#[test]
379+
fn test_wildcard_query_to_regex_on_text_case_insensitive() {
380+
let query = WildcardQuery {
381+
field: "text_field".to_string(),
382+
value: "MyString Wh1ch?a.nOrMal Tokenizer would*cut".to_string(),
383+
lenient: false,
384+
case_insensitive: true,
385+
};
386+
387+
let tokenizer_manager = create_default_quickwit_tokenizer_manager();
388+
for tokenizer in ["raw", "whitespace"] {
389+
let mut schema_builder = TantivySchema::builder();
390+
let text_options = TextOptions::default()
391+
.set_indexing_options(TextFieldIndexing::default().set_tokenizer(tokenizer));
392+
schema_builder.add_text_field("text_field", text_options);
393+
let schema = schema_builder.build();
394+
395+
let (_field, path, regex) = query.to_regex(&schema, &tokenizer_manager).unwrap();
396+
assert_eq!(regex, "(?i)MyString Wh1ch.a\\.nOrMal Tokenizer would.*cut");
397+
assert!(path.is_none());
398+
}
399+
400+
for tokenizer in [
401+
"raw_lowercase",
402+
"lowercase",
403+
"default",
404+
"en_stem",
405+
"chinese_compatible",
406+
"source_code_default",
407+
"source_code_with_hex",
408+
] {
409+
let mut schema_builder = TantivySchema::builder();
410+
let text_options = TextOptions::default()
411+
.set_indexing_options(TextFieldIndexing::default().set_tokenizer(tokenizer));
412+
schema_builder.add_text_field("text_field", text_options);
413+
let schema = schema_builder.build();
414+
415+
let (_field, path, regex) = query.to_regex(&schema, &tokenizer_manager).unwrap();
416+
assert_eq!(regex, "(?i)mystring wh1ch.a\\.normal tokenizer would.*cut");
417+
assert!(path.is_none());
418+
}
419+
}
362420
}

quickwit/rest-api-tests/scenarii/es_compatibility/0006-term_query.yaml

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,3 @@
1-
# case_insensitive not supported.
2-
engines: ["elasticsearch"]
31
params:
42
# this overrides the query sent in body apparently
53
size: 3

quickwit/rest-api-tests/scenarii/es_compatibility/0029-wildcard.yaml

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,3 +26,25 @@ expected:
2626
hits:
2727
total:
2828
value: 2
29+
---
30+
json:
31+
query:
32+
wildcard:
33+
repo.name:
34+
value: RUS*
35+
case_insensitive: true
36+
expected:
37+
hits:
38+
total:
39+
value: 1
40+
---
41+
json:
42+
query:
43+
wildcard:
44+
repo.name:
45+
value: RUS*
46+
case_insensitive: false
47+
expected:
48+
hits:
49+
total:
50+
value: 0

quickwit/rest-api-tests/scenarii/es_compatibility/0030-prefix.yaml

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,3 +26,25 @@ expected:
2626
hits:
2727
total:
2828
value: 2
29+
---
30+
json:
31+
query:
32+
prefix:
33+
repo.name:
34+
value: RUST
35+
case_insensitive: true
36+
expected:
37+
hits:
38+
total:
39+
value: 1
40+
---
41+
json:
42+
query:
43+
prefix:
44+
repo.name:
45+
value: RUST
46+
case_insensitive: false
47+
expected:
48+
hits:
49+
total:
50+
value: 0

0 commit comments

Comments
 (0)