# install.packages(c("readr", "dplyr", "stringr"), dependencies = TRUE)
library(readr)
library(dplyr)
library(stringr)

# ---------- CONFIG ----------
# Absolute paths on the G: drive; edit both before running on another machine.
# in_path:  tab-separated input file holding the scientific names to parse.
# out_path: tab-separated output file; receives the input columns plus the
#           parsed name fields.
in_path  <- "G:/GBIF.ES/Formacion/2025/2025 publicación seguimientos/Name-parser/Invasives/taxon invasive ESP only.tsv"
out_path <- "G:/GBIF.ES/Formacion/2025/2025 publicación seguimientos/Name-parser/Invasives/invasive parsed.tsv"

# ---------- HELPERS ----------
# Normalise whitespace in a character vector.
#
# Every run of whitespace is collapsed to a single space, non-breaking spaces
# (U+00A0) are turned into ordinary spaces, and leading/trailing whitespace is
# removed. NA elements stay NA.
norm_ws <- function(x) {
  collapsed <- str_replace_all(x, "\\s+", " ")
  plain_spaces <- str_replace_all(collapsed, "\u00A0", " ")  # non-breaking space
  str_trim(plain_spaces)
}

# ---------- PARSER ----------
# Split scientific names into genus, epithets, infraspecific rank and
# authorship using a single regular expression.
#
# Args:
#   x: character vector of scientific names (NA allowed). Whitespace is
#      normalised with norm_ws() before matching.
#
# Returns:
#   A tibble with one row per element of `x` and the character columns
#     genus, species_epithet, infraspecific_rank, infraspecific_epithet,
#     scientificName_authorship, taxonRank.
#   taxonRank is the infraspecific rank marker in lower case without its
#   trailing dot (e.g. "subsp", "var", "forma") when a marker is present,
#   otherwise "species" when a species epithet was found, otherwise "genus".
#   Names that do not start with a capitalised genus token followed by
#   whitespace or the end of the string yield NA in every column.
#
# Examples of the intended results:
#   "Acacia"                              -> genus only, taxonRank "genus"
#   "Acacia dealbata Link"                -> species, authorship "Link"
#   "Cortaderia selloana subsp. jubata X" -> rank "subsp.", epithet "jubata"
#   "Acacia dealbata auct."               -> authorship "auct.", no
#                                            infraspecific epithet
#
# NOTE: a lowercase word that directly follows the species epithet and is a
# whole token (e.g. the author particle in "de Candolle") is still captured as
# infraspecific_epithet, because rank-less trinomials are accepted.
parse_botanical <- function(x) {
  x <- norm_ws(x)

  # An epithet must be a whole token: the negative lookahead rejects partial
  # matches such as "auct" in "auct." or "sp" in "sp.".
  epithet <- "([a-z][a-z\\-]+)(?!\\S)"

  # "forma" carries a word boundary so that epithets like "formalis" are not
  # split into a rank plus a truncated epithet. The dotted markers are
  # delimited by their own dot.
  rank <- "(subsp\\.|ssp\\.|var\\.|subvar\\.|f\\.|forma\\b|subf\\.)"

  # Capture groups:
  # 1: genus (Capitalized)
  # 2: species epithet (lowercase, optional)
  # 3: infraspecific rank (optional)
  # 4: infraspecific epithet (optional)
  # 5: authorship (remaining string)
  # The genus may be followed by whitespace OR the end of the string, so a
  # bare genus name is parsed instead of being dropped.
  pattern <- paste0(
    "^([A-Z][a-zA-Z\\-]+)(?:\\s+|$)",
    "(?:", epithet, ")?\\s*",
    rank, "?\\s*",
    "(?:", epithet, ")?\\s*",
    "(.*)$"
  )

  # Column 1 of the match matrix is the full match; groups start at column 2.
  parts <- str_match(x, pattern)
  genus <- parts[, 2]
  species <- parts[, 3]
  infra_rank <- parts[, 4]
  infra_epithet <- parts[, 5]

  tibble(
    genus = genus,
    species_epithet = species,
    infraspecific_rank = infra_rank,
    infraspecific_epithet = infra_epithet,
    # na_if() keeps the column character even when no row has an authorship
    # (ifelse(..., NA, ...) would return a logical vector in that case).
    scientificName_authorship = na_if(str_trim(parts[, 6]), ""),
    taxonRank = case_when(
      !is.na(infra_rank) ~ tolower(str_replace(infra_rank, "\\.$", "")),
      !is.na(species) ~ "species",
      !is.na(genus) ~ "genus",
      TRUE ~ NA_character_
    )
  )
}

# ---------- LOAD ----------
# Read the tab-separated input as UTF-8 and silence the column-type message.
df <- read_tsv(in_path, show_col_types = FALSE, locale = locale(encoding = "UTF-8"))

# Make sure the names to parse live in a column called "name":
#   1. an existing "name" column is used as is;
#   2. otherwise "scientificName" is renamed to "name";
#   3. otherwise the first column is assumed to hold the names.
# Checking "name" first avoids the duplicate-column error that rename() raises
# when the file contains both "name" and "scientificName".
if (!"name" %in% names(df)) {
  if ("scientificName" %in% names(df)) {
    df <- rename(df, name = scientificName)
  } else {
    names(df)[1] <- "name"
  }
}

# ---------- PARSE ----------
# Normalise the name column, then append the parsed fields as extra columns.
# `.` is the magrittr placeholder for the piped data frame, so the parser
# receives the already-normalised names.
out <- df %>%
  mutate(name = norm_ws(name)) %>%
  bind_cols(parse_botanical(.$name))

# ---------- SAVE ----------
write_tsv(out, out_path)
cat("✅ Parsed file saved to:", out_path, "\n")
