diff --git a/datafusion/spark/src/function/string/mod.rs b/datafusion/spark/src/function/string/mod.rs index 8859beca77996..7bcdac5d85474 100644 --- a/datafusion/spark/src/function/string/mod.rs +++ b/datafusion/spark/src/function/string/mod.rs @@ -25,6 +25,7 @@ pub mod ilike; pub mod length; pub mod like; pub mod luhn_check; +pub mod soundex; pub mod space; pub mod substring; @@ -45,6 +46,7 @@ make_udf_function!(format_string::FormatStringFunc, format_string); make_udf_function!(space::SparkSpace, space); make_udf_function!(substring::SparkSubstring, substring); make_udf_function!(base64::SparkUnBase64, unbase64); +make_udf_function!(soundex::SparkSoundex, soundex); pub mod expr_fn { use datafusion_functions::export_functions; @@ -110,6 +112,7 @@ pub mod expr_fn { "Decodes the input string `str` from a base64 string into binary data.", str )); + export_functions!((soundex, "Returns Soundex code of the string.", str)); } pub fn functions() -> Vec> { @@ -127,5 +130,6 @@ pub fn functions() -> Vec> { space(), substring(), unbase64(), + soundex(), ] } diff --git a/datafusion/spark/src/function/string/soundex.rs b/datafusion/spark/src/function/string/soundex.rs new file mode 100644 index 0000000000000..1d23ca75456a2 --- /dev/null +++ b/datafusion/spark/src/function/string/soundex.rs @@ -0,0 +1,155 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +use arrow::array::{ArrayRef, OffsetSizeTrait, StringArray}; +use arrow::datatypes::DataType; +use datafusion_common::cast::{as_generic_string_array, as_string_view_array}; +use datafusion_common::utils::take_function_args; +use datafusion_common::{Result, exec_err}; +use datafusion_expr::{ColumnarValue, Signature, Volatility}; +use datafusion_expr::{ScalarFunctionArgs, ScalarUDFImpl}; +use datafusion_functions::utils::make_scalar_function; +use std::any::Any; +use std::sync::Arc; + +/// Spark-compatible `soundex` expression +/// +#[derive(Debug, PartialEq, Eq, Hash)] +pub struct SparkSoundex { + signature: Signature, +} + +impl Default for SparkSoundex { + fn default() -> Self { + Self::new() + } +} + +impl SparkSoundex { + pub fn new() -> Self { + Self { + signature: Signature::string(1, Volatility::Immutable), + } + } +} + +impl ScalarUDFImpl for SparkSoundex { + fn as_any(&self) -> &dyn Any { + self + } + + fn name(&self) -> &str { + "soundex" + } + + fn signature(&self) -> &Signature { + &self.signature + } + + fn return_type(&self, arg_types: &[DataType]) -> Result { + match &arg_types[0] { + DataType::LargeUtf8 => Ok(DataType::LargeUtf8), + _ => Ok(DataType::Utf8), + } + } + + fn invoke_with_args(&self, args: ScalarFunctionArgs) -> Result { + make_scalar_function(spark_soundex_inner, vec![])(&args.args) + } +} + +fn spark_soundex_inner(arg: &[ArrayRef]) -> Result { + let [array] = take_function_args("soundex", arg)?; + match &array.data_type() { + DataType::Utf8 => soundex_array::(array), + DataType::LargeUtf8 => soundex_array::(array), + DataType::Utf8View => soundex_view(array), + other => { + exec_err!("unsupported data type {other:?} for function `soundex`") + } + } +} + +fn soundex_array(array: &ArrayRef) -> Result { + let str_array = as_generic_string_array::(array)?; + let result = str_array + .iter() + .map(|s| s.map(compute_soundex)) + .collect::(); + Ok(Arc::new(result)) +} + +fn soundex_view(str_view: &ArrayRef) -> Result { + let str_array = as_string_view_array(str_view)?; + let result = str_array + .iter() + .map(|opt_str| opt_str.map(compute_soundex)) + .collect::(); + Ok(Arc::new(result) as ArrayRef) +} + +fn classify_char(c: char) -> Option { + match c.to_ascii_uppercase() { + 'B' | 'F' | 'P' | 'V' => Some('1'), + 'C' | 'G' | 'J' | 'K' | 'Q' | 'S' | 'X' | 'Z' => Some('2'), + 'D' | 'T' => Some('3'), + 'L' => Some('4'), + 'M' | 'N' => Some('5'), + 'R' => Some('6'), + _ => None, + } +} + +fn is_ignored(c: char) -> bool { + matches!(c.to_ascii_uppercase(), 'H' | 'W') +} + +fn compute_soundex(s: &str) -> String { + let mut chars = s.chars(); + + let first_char = match chars.next() { + Some(c) if c.is_ascii_alphabetic() => c.to_ascii_uppercase(), + _ => return s.to_string(), + }; + + let mut soundex_code = String::with_capacity(4); + soundex_code.push(first_char); + let mut last_code = classify_char(first_char); + + for c in chars { + if soundex_code.len() >= 4 { + break; + } + + if is_ignored(c) { + continue; + } + + match classify_char(c) { + Some(code) => { + if last_code != Some(code) { + soundex_code.push(code); + } + last_code = Some(code); + } + None => { + last_code = None; + } + } + } + format!("{soundex_code:0<4}") +} diff --git a/datafusion/sqllogictest/test_files/spark/string/soundex.slt b/datafusion/sqllogictest/test_files/spark/string/soundex.slt index f0c46e10fd1de..ec85c4bd40b24 100644 --- a/datafusion/sqllogictest/test_files/spark/string/soundex.slt +++ b/datafusion/sqllogictest/test_files/spark/string/soundex.slt @@ -15,13 +15,187 @@ # specific language governing permissions and limitations # under the License. -# This file was originally created by a porting script from: -# https://github.com/lakehq/sail/tree/43b6ed8221de5c4c4adbedbb267ae1351158b43c/crates/sail-spark-connect/tests/gold_data/function -# This file is part of the implementation of the datafusion-spark function library. -# For more information, please see: -# https://github.com/apache/datafusion/issues/15914 - -## Original Query: SELECT soundex('Miller'); -## PySpark 3.5.5 Result: {'soundex(Miller)': 'M460', 'typeof(soundex(Miller))': 'string', 'typeof(Miller)': 'string'} -#query -#SELECT soundex('Miller'::string); +query T +SELECT soundex('Miller'); +---- +M460 + +query T +SELECT soundex(NULL); +---- +NULL + +query T +SELECT soundex(''); +---- +(empty) + +query T +SELECT soundex('Apache Spark'); +---- +A122 + +query T +SELECT soundex('123'); +---- +123 + +query T +SELECT soundex('a123'); +---- +A000 + +query T +SELECT soundex('Datafusion'); +---- +D312 + +query T +SELECT soundex('Ashcroft'); +---- +A261 + +query T +SELECT soundex('B1B'); +---- +B100 + +query T +SELECT soundex('B B'); +---- +B100 + +query T +SELECT soundex('BAB'); +---- +B100 + +query T +SELECT soundex('#hello'); +---- +#hello + +query T +SELECT soundex(' hello'); +---- + hello + +query T +SELECT soundex('\thello'); +---- +\thello + +query T +SELECT soundex('😀hello'); +---- +😀hello + +query T +SELECT soundex('123'); +---- +123 + +query T +SELECT soundex('1abc'); +---- +1abc + +query T +SELECT soundex('A'); +---- +A000 + +query T +SELECT soundex('BFPV'); +---- +B000 + +query T +SELECT soundex('Robert'); +---- +R163 + +query T +SELECT soundex('Rupert'); +---- +R163 + +query T +SELECT soundex(NULL); +---- +NULL + +query T +SELECT soundex(''); +---- +(empty) + +query T +SELECT soundex('robert'); +---- +R163 + +query T +SELECT soundex('rObErT'); +---- +R163 + +query T +SELECT soundex('Müller'); +---- +M460 + +query T +SELECT soundex('Abcdefghijklmnop'); +---- +A123 + +query T +SELECT soundex('Lloyd'); +---- +L300 + +query T +SELECT soundex('BWB'); +---- +B000 + +query T +SELECT soundex('BHB'); +---- +B000 + +query T +SELECT soundex('Tymczak'); +---- +T522 + +query T +SELECT soundex('Aeiou'); +---- +A000 + +query T +SELECT soundex('1Robert'); +---- +1Robert + +query T +SELECT soundex('Smith-Jones'); +---- +S532 + +query T +SELECT soundex('#'); +---- +# + +query T +SELECT soundex('\nhello'); +---- +\nhello + +query T +SELECT concat(soundex(' '), 'Spark') +---- + Spark