/**
 * Lower-case tokens that removeProductStopwords drops from a product name.
 *
 * Entries are compared against whole space-separated words. Entries that
 * contain non-letters ("cal-o", "x/xf") can only match names that have not
 * been through processProductName, which strips every non-letter character.
 *
 * Each token is listed exactly once; insertion order is kept as-is.
 */
export const PRODUCT_STOPWORDS = new Set([
  "bag",
  "bagged",
  "bunch",
  "bunched",
  "bunches",
  "bulk",
  "cal-o",
  "calo",
  "no",
  "insulated",
  "ncert",
  "box",
  "girl",
  "org",
  "organic",
  "nonorganic",
  "gmo",
  "herbs",
  "citrus",
  "bundle",
  "carton",
  "case",
  "net",
  "clamshell",
  "container",
  "count",
  "crate",
  "ct",
  "dozen",
  "each",
  "f",
  "ft",
  "fcy",
  "jbo",
  "jumbo",
  "med",
  "medium",
  "large",
  "lrg",
  "gallon",
  "jar",
  "kg",
  "lb",
  "liter",
  "loaf",
  "pack",
  "packet",
  "piece",
  "pint",
  "pound",
  "pouch",
  "quart",
  "roll",
  "sack",
  "sheet",
  "tote",
  "unit",
  "x/xf",
  "x",
  "xf",
  "fxf",
  "us",
  "usa",
  "bc",
  "on",
  "ab",
  "wa",
  "ca",
  "az",
  "ir",
  "upc",
  "mx",
  "mxc",
  "mxco",
  "vf",
  "xoz", // occurs after removing numerical chars from unit info like "1x12oz"
  "xlb",
  "xct",
  "xpk",
  "xqt",
  "xga",
  "xpt",
  "xlt",
  "xfl",
  "xkg",
  "xg",
  "xgr",
  "wrapper",
  "tape",
  "prebook",
  "special",
  "intown",
  "outoftown",
  "layer",
  "mesh",
  "united",
  "earth",
  "cal",
  "variegated",
  "bin",
  "ralphs",
  "fview",
  "tops",
  "christopher",
  "mex",
  "bags",
  "standard",
  // "super",
]);

/**
 * Short words that removeProductStopwords keeps even though it otherwise
 * discards every word of two characters or fewer.
 */
export const ACCEPTED_BIGRAMS: Set<string> = new Set<string>(["yu"]);

/**
 * Phrase-level substitutions that processProductName applies to the whole,
 * lower-cased product name. Keys are matched as plain substrings (so they may
 * span several words) and are applied in insertion order.
 */
export const PRODUCT_STRING_REPLACEMENTS = new Map<string, string>([
  // ["blueberries", "blueberry"], ["raspberries", "raspberry"],  //  could just handle with regex or replace on berries
  ["macintosh", "mcintosh"],
  ["apples", "apple"],
  ["celery root", "celeriac"],
  ["pink ladies", "pink lady"],
  ["costa rica", ""],
  ["red bor", "redbor"],
  ["ylw", "yellow"],
  ["grn", "green"],
  ["blk", "black"],
  ["bby", "baby"],
  ["swts", "sweet"],
  // The key ends with a space, so the replacement must too; otherwise the
  // following word would be glued on ("orntal yam" -> "orientalyam").
  ["orntal ", "oriental "],
  // "shred" and "dr" are handled as whole words in PRODUCT_NAME_REPLACEMENTS;
  // as substrings they would also hit longer words such as "shredded".
]);

/**
 * Word-level substitutions: applied by processProductName only when a
 * space-separated word equals the key exactly.
 */
const PRODUCT_NAME_REPLACEMENTS = new Map<string, string>([
  ["shred", "shredded"],
  ["dr", "dried"],
]);

/**
 * Normalizes a raw product name for matching.
 *
 * Steps, in order:
 *  1. drop every character that is not an ASCII letter or whitespace;
 *  2. collapse each run of two or more spaces into one space;
 *  3. lower-case the result;
 *  4. apply every PRODUCT_STRING_REPLACEMENTS entry to all of its occurrences;
 *  5. swap whole words listed in PRODUCT_NAME_REPLACEMENTS.
 *
 * Leading/trailing spaces are not trimmed, and a replacement with an empty
 * value can leave adjacent spaces behind.
 *
 * @param word - Raw product name (may contain several words).
 * @returns The normalized, lower-case name.
 */
export const processProductName = (word: string): string => {
  let processedName = word
    .replace(/[^a-zA-Z\s]/g, "")
    // Removing digits/punctuation can leave several gaps; collapse all of them.
    .replace(/ {2,}/g, " ")
    .toLowerCase();

  for (const [phrase, replacement] of PRODUCT_STRING_REPLACEMENTS) {
    // split/join replaces every occurrence, not just the first one.
    processedName = processedName.split(phrase).join(replacement);
  }

  // Substitute per token so a short key like "dr" never rewrites the inside
  // of an unrelated word (e.g. "hydro").
  return processedName
    .split(" ")
    .map((token) => PRODUCT_NAME_REPLACEMENTS.get(token) ?? token)
    .join(" ");
};

/**
 * Strips stopwords and very short tokens from a product name.
 *
 * The name is split on single spaces. A token survives only if it is not in
 * PRODUCT_STOPWORDS and is either longer than two characters or listed in
 * ACCEPTED_BIGRAMS. Survivors are re-joined with single spaces.
 *
 * @param productName - Space-separated product name.
 * @returns The name with the rejected tokens removed.
 */
export const removeProductStopwords = (productName: string): string => {
  const kept: string[] = [];
  for (const token of productName.split(" ")) {
    if (PRODUCT_STOPWORDS.has(token)) {
      continue;
    }
    const isLongEnough = token.length > 2;
    if (isLongEnough || ACCEPTED_BIGRAMS.has(token)) {
      kept.push(token);
    }
  }
  return kept.join(" ");
};

/**
 * Drops one trailing lower-case "s" from every space-separated word, unless
 * the word ends in "ss" (e.g. "hass" is left alone).
 *
 * @param productName - Space-separated product name.
 * @returns The name with the trailing "s" characters removed.
 */
export const removeTrailingS = (productName: string): string => {
  const stripped: string[] = [];
  for (const token of productName.split(" ")) {
    const endsWithSingleS = token.endsWith("s") && !token.endsWith("ss");
    stripped.push(endsWithSingleS ? token.slice(0, -1) : token);
  }
  return stripped.join(" ");
};

/**
 * Maps lower-case unit spellings (singular, plural, abbreviations, and "#"
 * for pounds) to the canonical abbreviation used in detected unit info.
 */
const unitMeasureMap: Record<string, string> = {
  ea: "unit",
  each: "unit",
  ct: "unit",
  cts: "unit",
  count: "unit",
  counts: "unit",
  cnt: "unit",
  cnts: "unit",
  pint: "pt",
  pt: "pt",
  gal: "gal",
  gals: "gal",
  gall: "gal",
  galls: "gal",
  gallon: "gal",
  gallons: "gal",
  oz: "oz",
  ozs: "oz",
  ounce: "oz",
  ounces: "oz",
  "#": "lb",
  lb: "lb",
  lbs: "lb",
  pound: "lb",
  pounds: "lb",
  g: "g",
  gr: "g",
  grs: "g",
  grm: "g",
  grms: "g",
  gs: "g",
  gram: "g",
  grams: "g",
  gramme: "g",
  grammes: "g",
  kg: "kg",
  kgs: "kg",
  kilo: "kg",
  kilogram: "kg",
  kilograms: "kg",
  kilogramme: "kg",
  kilogrammes: "kg",
  milligram: "mg",
  milligrams: "mg",
  milligramme: "mg",
  milligrammes: "mg",
  liter: "l",
  litre: "l",
  litres: "l",
  liters: "l",
  "litre(s)": "l",
  "litres(s)": "l",
  millilitre: "ml",
  millilitres: "ml",
  milliliter: "ml",
  milliliters: "ml",
  ml: "ml",
  mls: "ml",
};

/** Lower-case spellings that findUnitLabel treats as the "bunch" label. */
const BUNCH_UNIT_WORDS = new Set([
  "bunch",
  "bunches",
  "bunched",
  "bunch(es)",
  "bunch(s)",
]);

/**
 * Resolves a unit spelling to its canonical abbreviation.
 *
 * The lookup is case-insensitive. When the spelling is not in unitMeasureMap
 * the input is returned unchanged (original casing), so in practice this
 * never returns null.
 *
 * @param unit - Unit text such as "LBS", "#", or "ct".
 * @returns The canonical abbreviation, or `unit` itself when unknown.
 */
const matchUnitMeasure = (unit: string): string | null => {
  const key = unit.toLowerCase();
  // Only consult own keys: a plain object also exposes inherited members
  // (e.g. "constructor"), which would otherwise leak out as a non-string.
  const isKnown = Object.prototype.hasOwnProperty.call(unitMeasureMap, key);
  return (isKnown && unitMeasureMap[key]) || unit;
};

/**
 * Finds the packaging label mentioned in a description.
 *
 * The description is lower-cased and split on single spaces; the first word
 * that names a known package decides the label.
 *
 * @param description - Free-text product description.
 * @returns "bag", "can", "cup", "jar", "packet", "bunch", or "" if none found.
 */
const findUnitLabel = (description: string): string => {
  for (const token of description.toLowerCase().split(" ")) {
    switch (token) {
      case "bag":
      case "bagged":
      case "bags":
      case "pouch":
      case "pouches":
        // Pouches are reported under the "bag" label.
        return "bag";
      case "can":
      case "cup":
      case "jar":
      case "packet":
        return token;
      default:
        if (BUNCH_UNIT_WORDS.has(token)) {
          return "bunch";
        }
    }
  }
  return "";
};

/**
 * Detects case-pack unit info of the form "<count>x<size><unit>", e.g.
 * "12x3#" or "1X12.5OZ".
 *
 * The "x" separator and the unit are matched case-insensitively, and the
 * per-unit size may carry a decimal fraction. Only the first occurrence in
 * the description is used.
 *
 * @param description - Free-text product description.
 * @returns The parsed case info (with `isCaseUnit: true`), or null when the
 *   description contains no case-pack pattern.
 */
const detectCaseUnit = (
  description: string,
): {
  quantity: number;
  unitQuantity: number;
  totalQuantity: number;
  unit: string;
  label: string;
  isCaseUnit: boolean;
} | null => {
  // <count> x <size, optionally decimal> <unit letters or "#">
  const match = description.match(/(\d+)x(\d+(?:\.\d+)?)([a-z#]+)/i);
  if (!match) {
    return null;
  }

  const quantity = parseInt(match[1], 10);
  // parseFloat keeps fractional sizes such as "2.5"; whole numbers are unchanged.
  const unitQuantity = parseFloat(match[2]);
  const rawUnit = match[3].toLowerCase();

  return {
    totalQuantity: quantity * unitQuantity,
    quantity,
    unitQuantity,
    // Fall back to the raw spelling if no canonical unit is available.
    unit: matchUnitMeasure(rawUnit) || rawUnit,
    // Packaging label for the sub unit, looked up across the whole description.
    label: findUnitLabel(description),
    isCaseUnit: true,
  };
};

/**
 * Detects single-unit info of the form "<quantity><unit>", e.g. "12ct" or
 * "2.5lb".
 *
 * Descriptions containing a "<n>/<m><unit>" pattern such as "88/100ct" are
 * rejected up front: those are size estimates for fruit cases, not unit info.
 * Only the first "<quantity><unit>" occurrence is used.
 *
 * @param description - Free-text product description.
 * @returns The parsed unit info, or null when nothing usable is found.
 */
const detectIndividualUnit = (
  description: string,
): {
  unitQuantity: number;
  unit: string;
  label: string;
} | null => {
  // "88/100CT"-style size ranges are not unit quantities.
  if (/\d+\/\d+[a-zA-Z#]+/.test(description)) {
    return null;
  }

  // The optional fraction keeps "2.5lb" from being read as "5lb".
  const match = description.match(/(\d+(?:\.\d+)?)([a-zA-Z#]+)/);
  if (!match) {
    return null;
  }

  const rawUnit = match[2].toLowerCase();
  return {
    unitQuantity: parseFloat(match[1]),
    // Fall back to the raw spelling if no canonical unit is available.
    unit: matchUnitMeasure(rawUnit) || rawUnit,
    label: findUnitLabel(description),
  };
};

/**
 * Extracts unit information from a product description.
 *
 * A case-pack match from detectCaseUnit wins and is returned as-is. Otherwise
 * the result of detectIndividualUnit is wrapped as a single-unit entry
 * (`quantity` 1, `totalQuantity` equal to `unitQuantity`, `isCaseUnit` false).
 *
 * @param description - Free-text product description.
 * @returns The detected unit info, or null when neither detector matches.
 */
export const detectUnits = (
  description: string,
): {
  quantity: number;
  unitQuantity: number;
  totalQuantity: number;
  unit: string;
  label: string;
  isCaseUnit: boolean;
} | null => {
  const asCase = detectCaseUnit(description);
  if (asCase) {
    return asCase;
  }

  const single = detectIndividualUnit(description);
  if (!single) {
    return null;
  }

  // A falsy quantity is reported as 0.
  const unitQuantity = single.unitQuantity || 0;
  return {
    unitQuantity,
    unit: single.unit,
    label: single.label || "",
    quantity: 1,
    totalQuantity: unitQuantity,
    isCaseUnit: false,
  };
};
