From 6dc672fcc756895cd03b1b62883a70bc1a70ca0a Mon Sep 17 00:00:00 2001 From: Hadi <112569860+anotherhadi@users.noreply.github.com> Date: Wed, 24 Sep 2025 19:54:32 +0200 Subject: [PATCH] Change suggestion strategy Signed-off-by: Hadi <112569860+anotherhadi@users.noreply.github.com> --- leak-utils/go.mod | 1 + leak-utils/go.sum | 2 + leak-utils/parquet/suggestions.go | 106 ++++++++++++++++++------------ nix/leak-utils.nix | 2 +- 4 files changed, 68 insertions(+), 43 deletions(-) diff --git a/leak-utils/go.mod b/leak-utils/go.mod index ac3a5f2..e3cc2a9 100644 --- a/leak-utils/go.mod +++ b/leak-utils/go.mod @@ -7,6 +7,7 @@ require ( github.com/charmbracelet/log v0.4.2 github.com/marcboeker/go-duckdb/v2 v2.4.0 github.com/spf13/pflag v1.0.10 + golang.org/x/text v0.28.0 ) require ( diff --git a/leak-utils/go.sum b/leak-utils/go.sum index 695ae59..85298e4 100644 --- a/leak-utils/go.sum +++ b/leak-utils/go.sum @@ -100,6 +100,8 @@ golang.org/x/sync v0.16.0/go.mod h1:1dzgHSNfp02xaA81J2MS99Qcpr2w7fw1gpm99rleRqA= golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.35.0 h1:vz1N37gP5bs89s7He8XuIYXpyY0+QlsKmzipCbUtyxI= golang.org/x/sys v0.35.0/go.mod h1:BJP2sWEmIv4KK5OTEluFJCKSidICx8ciO85XgH3Ak8k= +golang.org/x/text v0.28.0 h1:rhazDwis8INMIwQ4tpjLDzUhx6RlXqZNPEM0huQojng= +golang.org/x/text v0.28.0/go.mod h1:U8nCwOR8jO/marOQ0QbDiOngZVEBB7MAiitBuMjXiNU= golang.org/x/tools v0.36.0 h1:kWS0uv/zsvHEle1LbV5LE8QujrxB3wfQyxHfhOk0Qkg= golang.org/x/tools v0.36.0/go.mod h1:WBDiHKJK8YgLHlcQPYQzNCkUxUypCaa5ZegCVutKm+s= golang.org/x/xerrors v0.0.0-20240903120638-7835f813f4da h1:noIWHXmPHxILtqtCOPIhSt0ABwskkZKjD3bXGnZGpNY= diff --git a/leak-utils/parquet/suggestions.go b/leak-utils/parquet/suggestions.go index 2038c67..49e97ec 100644 --- a/leak-utils/parquet/suggestions.go +++ b/leak-utils/parquet/suggestions.go @@ -1,15 +1,20 @@ package parquet import ( + "regexp" "slices" + "strings" + "unicode" + + "golang.org/x/text/unicode/norm" ) -func getSuggestion(col string) string { - col = formatColumnName(col) - knownNames := []string{ +var ( + knownColumnNames = []string{ "date", "phone", "username", + "iban", "address", "email", "postal_code", @@ -28,47 +33,49 @@ func getSuggestion(col string) string { "url", "ip", } - if slices.Contains(knownNames, col) { - return col + + suggestions = map[string]string{ + "user": "username", + "login": "username", + "sex": "gender", + "genre": "gender", + "ipaddress": "ip", + "firstname": "first_name", + "prenom": "first_name", + "lastname": "last_name", + "nom": "last_name", + "fullname": "full_name", + "nomcomplet": "full_name", + "adresse": "address", + "streetaddress": "address", + "ville": "city", + "pays": "country", + "mail": "email", + "zip": "postal_code", + "postalcode": "postal_code", + "zipcode": "postal_code", + "postal": "postal_code", + "codepostal": "postal_code", + "hash": "password_hash", + "hashedpassword": "password_hash", + "hashpassword": "password_hash", + "passwordhashed": "password_hash", + "birthdate": "birth_date", + "dob": "birth_date", + "dateofbirth": "birth_date", } - if col == "user" { - return "username" +) + +func getSuggestion(col string) string { + colFormated := formatColumnName(col) + if slices.Contains(knownColumnNames, colFormated) { + return colFormated } - if col == "login" { - return "username" - } - if col == "sex" { - return "gender" - } - if col == "ip_address" { - return "ip" - } - if col == "password_hashed" { - return "password_hash" - } - if col == "firstname" { - return "first_name" - } - if col == "lastname" { - return "last_name" - } - if col == "fullname" { - return "full_name" - } - if col == "mail" { - return "email" - } - if col == "zip" || col == "postalcode" || col == "zipcode" || col == "postal" || col == "zip_code" { - return "postal_code" - } - if col == "street_address" { - return "address" - } - if col == "hash" || col == "hashed_password" || col == "hash_password" { - return "password_hash" - } - if col == "birthdate" || col == "dob" || col == "date_of_birth" { - return "birth_date" + + col = cleanString(col) + + if val, ok := suggestions[col]; ok { + return val } return "" @@ -79,3 +86,18 @@ func getSuggestion(col string) string { // url: _url, link // address: _address // + +func cleanString(input string) string { + t := norm.NFD.String(input) + var sb strings.Builder + for _, r := range t { + if unicode.Is(unicode.Mn, r) { + continue + } + sb.WriteRune(r) + } + s := strings.ToLower(sb.String()) + reg, _ := regexp.Compile("[^a-z]+") + s = reg.ReplaceAllString(s, "") + return s +} diff --git a/nix/leak-utils.nix b/nix/leak-utils.nix index 6e2653c..e467dfa 100644 --- a/nix/leak-utils.nix +++ b/nix/leak-utils.nix @@ -9,7 +9,7 @@ pname = name; version = "0.1.0"; src = ../leak-utils; - vendorHash = "sha256-rTfbXCiwv/+tVXZmgztt088Zhz0OQaVTfvxXVzw4o4Q="; + vendorHash = "sha256-qgDqmEgL7B8FvoKNwLG0buLmg9Yt54cyWwmXBifgr/g="; buildInputs = [ pkgs.duckdb