new action: countLinesWithAt (both txt/csv & parquet) (#5)

Signed-off-by: Hadi <112569860+anotherhadi@users.noreply.github.com>
This commit is contained in:
Hadi
2025-09-24 21:12:31 +02:00
parent 7ee82425c2
commit ba5ef5c0f3
4 changed files with 95 additions and 3 deletions

View File

@@ -33,6 +33,7 @@ func main() {
"mergeFiles",
"deleteFirstLines",
"deleteLastLines",
"countLinesWithAt",
"removeUrlSchemeFromUlp",
}
@@ -162,6 +163,24 @@ func main() {
log.Fatal("Failed to remove last lines", "error", err)
}
return
case "countLinesWithAt":
// Action: report how many lines of the input file contain an "@",
// alongside the file's total line count.
//
// Flags registered for this action:
//   --input / -i   path of the file to inspect (required)
//   --no-colors    disable colored output
//   --debug        enable debug mode on lu
var inputFile *string = flag.StringP("input", "i", "", "Input file")
var noColors *bool = flag.Bool("no-colors", false, "Remove all colors")
var debug *bool = flag.Bool("debug", false, "Debug mode")
flag.Parse()
// An empty --input is treated as a fatal usage error.
if *inputFile == "" {
log.Fatal("Input file are required")
}
if *noColors {
settings.DisableColors()
}
lu.Debug = *debug
// misc.CountLinesWithAt returns the matching count first, then the total.
countAt, countLine, err := misc.CountLinesWithAt(lu, *inputFile)
if err != nil {
log.Fatal("Failed to count @", "error", err)
}
// Print a one-line summary: fixed text in the Base style, the two counts
// and the file name in the Accent style.
fmt.Println(settings.Base.Render("There are"), settings.Accent.Render(fmt.Sprintf("%d", countAt)), settings.Base.Render("lines with @ out of"), settings.Accent.Render(fmt.Sprintf("%d", countLine)), settings.Base.Render("lines in"), settings.Accent.Render(*inputFile))
return
case "removeUrlSchemeFromUlp":
var inputFile *string = flag.StringP("input", "i", "", "Input Parquet file")
var noColors *bool = flag.Bool("no-colors", false, "Remove all colors")

View File

@@ -0,0 +1,73 @@
package misc
import (
"bufio"
"fmt"
"os"
"strings"
"github.com/anotherhadi/eleakxir/leak-utils/parquet"
"github.com/anotherhadi/eleakxir/leak-utils/settings"
)
// CountLinesWithAt reports how many lines of inputFile contain an "@" (nAt)
// and how many lines the file has in total (nLines).
//
// A file whose name ends in ".parquet" is handed to countRowsWithAtInParquet,
// where rows take the place of lines. Any other file is read as plain text,
// one line at a time.
//
// On failure both counts are zero and err describes what went wrong.
func CountLinesWithAt(lu settings.LeakUtils, inputFile string) (nAt, nLines int, err error) {
	if strings.HasSuffix(inputFile, ".parquet") {
		return countRowsWithAtInParquet(lu, inputFile)
	}

	// bufio.Scanner gives up with bufio.ErrTooLong on any line longer than
	// its default 64 KiB limit, which would abort the whole count on a single
	// oversized line. Raise the ceiling; the buffer starts small and only
	// grows when a long line is actually encountered.
	const (
		initialLineBuf = 64 * 1024
		maxLineSize    = 1 << 30
	)

	in, err := os.Open(inputFile)
	if err != nil {
		return 0, 0, err
	}
	defer in.Close()

	scanner := bufio.NewScanner(in)
	scanner.Buffer(make([]byte, 0, initialLineBuf), maxLineSize)
	for scanner.Scan() {
		if strings.Contains(scanner.Text(), "@") {
			nAt++
		}
		nLines++
	}
	if err := scanner.Err(); err != nil {
		return 0, 0, fmt.Errorf("reading %q: %w", inputFile, err)
	}
	return nAt, nLines, nil
}
// countRowsWithAtInParquet reports how many rows of the Parquet file at
// inputFile have an "@" in at least one column (nAt) and how many rows the
// file has in total (nLine).
//
// Column names are obtained from parquet.GetColumns; a file with no columns
// yields 0, 0, nil. Both counts are computed with SQL run through lu.Db using
// read_parquet. On failure both counts are zero.
func countRowsWithAtInParquet(lu settings.LeakUtils, inputFile string) (nAt, nLine int, err error) {
	cols, err := parquet.GetColumns(lu.Db, inputFile)
	if err != nil {
		return 0, 0, err
	}
	if len(cols) == 0 {
		return 0, 0, nil
	}

	// The statements are assembled as text, so everything interpolated into
	// them must be escaped: a single quote in the path would otherwise end
	// the string literal early.
	source := fmt.Sprintf("read_parquet('%s')", strings.ReplaceAll(inputFile, "'", "''"))

	conds := make([]string, 0, len(cols))
	for _, col := range cols {
		// Double-quote the identifier so names containing spaces, punctuation
		// or reserved words stay valid SQL. The cast lets the pattern match
		// run against the text form of non-string columns instead of
		// depending on an implicit conversion.
		ident := `"` + strings.ReplaceAll(col, `"`, `""`) + `"`
		conds = append(conds, fmt.Sprintf("CAST(%s AS VARCHAR) LIKE '%%@%%'", ident))
	}

	query := fmt.Sprintf("SELECT COUNT(*) FROM %s WHERE %s", source, strings.Join(conds, " OR "))
	if err = lu.Db.QueryRow(query).Scan(&nAt); err != nil {
		return 0, 0, fmt.Errorf("counting rows with @ in %q: %w", inputFile, err)
	}

	query = fmt.Sprintf("SELECT COUNT(*) FROM %s", source)
	if err = lu.Db.QueryRow(query).Scan(&nLine); err != nil {
		return 0, 0, fmt.Errorf("counting rows in %q: %w", inputFile, err)
	}
	return nAt, nLine, nil
}

View File

@@ -252,7 +252,7 @@ func GetParquet(db *sql.DB, inputFile string) (parquet *Parquet, err error) {
parquet = &Parquet{}
parquet.Filepath = inputFile
parquet.Columns, err = getColumns(db, inputFile)
parquet.Columns, err = GetColumns(db, inputFile)
if err != nil {
return
}

View File

@@ -7,8 +7,8 @@ import (
"strings"
)
// getColumns retrieves the column names from the Parquet file.
func getColumns(db *sql.DB, filepath string) ([]string, error) {
// GetColumns retrieves the column names from the Parquet file.
func GetColumns(db *sql.DB, filepath string) ([]string, error) {
// Create a view from the parquet file
query := fmt.Sprintf("CREATE OR REPLACE VIEW parquet_view AS SELECT * FROM read_parquet('%s')", filepath)
_, err := db.Exec(query)