/*
 * Copyright (c) 2026 Dylan Araps
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#ifndef DYLAN_UTF8_H
#define DYLAN_UTF8_H

#include "util.h"

//
// Length in bytes of the UTF-8 sequence introduced by lead byte b.
// Returns 0 when b cannot start a sequence: 0x80-0xbf and 0xf8-0xff.
//
static inline usize
utf8_expected(u8 b)
{
  // One entry per value of the top five bits of the byte.
  static const u8 len_by_prefix[32] = {
    1, 1, 1, 1, 1, 1, 1, 1,  // 0x00-0x3f
    1, 1, 1, 1, 1, 1, 1, 1,  // 0x40-0x7f
    0, 0, 0, 0, 0, 0, 0, 0,  // 0x80-0xbf
    2, 2, 2, 2,              // 0xc0-0xdf
    3, 3,                    // 0xe0-0xef
    4,                       // 0xf0-0xf7
    0                        // 0xf8-0xff
  };
  usize prefix = b >> 3;
  return len_by_prefix[prefix];
}

//
// Width of code point c: 0, 1 or 2.
//
// Both tables hold inclusive {first, last} ranges. The zero-width table is
// consulted first, so a code point listed in both (the emoji modifiers sit
// inside the emoji block) gets width 0. Anything in neither table is 1.
//
static inline usize
utf8_width(u32 c)
{
  static const u32 zero[][2] = {
    {0x0000,  0x001f},   // NUL and control.
    {0x007f,  0x009f},   // Control.
    {0x0300,  0x036f},   // Combining.
    {0x1ab0,  0x1aff},
    {0x1dc0,  0x1dff},
    {0x200d,  0x200d},   // Zero width joiner.
    {0x20d0,  0x20ff},   // Combining.
    {0xfe00,  0xfe0f},   // Variation selectors.
    {0xfe20,  0xfe2f},   // Combining.
    {0x1f3fb, 0x1f3ff},  // Emoji modifiers.
    {0xe0100, 0xe01ef},  // Combining.
  };
  static const u32 wide[][2] = {
    {0x1100,  0x115f},   // East asian wide.
    {0x2329,  0x232a},
    {0x2600,  0x27bf},   // Emoji block.
    {0x2b50,  0x2b55},   // Emoji block.
    {0x2e80,  0x303e},   // East asian wide; 0x303f itself is excluded.
    {0x3040,  0xa4cf},
    {0xac00,  0xd7a3},
    {0xf900,  0xfaff},
    {0xfe10,  0xfe19},
    {0xfe30,  0xfe6f},
    {0xff00,  0xff60},
    {0xffe0,  0xffe6},
    {0x1f300, 0x1faff},  // Emoji block.
    {0x20000, 0x2fffd},  // East asian wide.
    {0x30000, 0x3fffd},
  };

  for (usize i = 0; i < sizeof zero / sizeof zero[0]; i++) {
    if (c >= zero[i][0] && c <= zero[i][1]) return 0;
  }

  for (usize i = 0; i < sizeof wide / sizeof wide[0]; i++) {
    if (c >= wide[i][0] && c <= wide[i][1]) return 2;
  }

  return 1;
}

//
// Branchless UTF8 decoder by Skeeto.
// Source: https://nullprogram.com/blog/2017/10/06/
//
// Decodes the sequence starting at b into *c and returns a pointer to the
// byte that follows it. The sequence length is taken from the lead byte
// alone (see utf8_expected()); nothing is validated here. When the lead byte
// has no valid length the value stored in *c is not meaningful and the
// returned pointer is one byte past b.
//
// b[0] through b[3] are always read, whatever the sequence length, so all
// four bytes must be readable.
//
static inline void *
utf8_decode(void *b, u32 *c)
{
  // Payload bits of the lead byte, indexed by sequence length.
  static const int lead_mask[] = {0x00, 0x7f, 0x1f, 0x0f, 0x07};
  // Low bits to discard after assembling as a four byte sequence.
  static const int drop_bits[] = {0, 18, 12, 6, 0};

  unsigned char *src = (unsigned char *)b;
  usize len = utf8_expected(src[0]);

  // Assemble as if the sequence were four bytes long, then shift out the
  // bits that came from bytes beyond its real length.
  u32 cp = ((u32)(src[0] & lead_mask[len]) << 18) |
           ((u32)(src[1] & 0x3f) << 12) |
           ((u32)(src[2] & 0x3f) << 6) |
           (u32)(src[3] & 0x3f);
  *c = cp >> drop_bits[len];

  // !len turns a length of zero into a one byte step.
  return src + len + !len;
}

//
// Validating counterpart of utf8_decode() for input that may be malformed.
//
// Decodes exactly as utf8_decode() does: same value in *c, same returned
// pointer, same unconditional read of b[0] through b[3]. In addition *e
// reports whether the sequence was well formed. *e is 0 on success and
// nonzero when the lead byte is invalid, the encoding is non-canonical, the
// value is a surrogate half or above 0x10FFFF, or a continuation byte does
// not have its top two bits set to 10.
//
static inline void *
utf8_decode_untrusted(void *b, u32 *c, int *e)
{
  // Smallest code point that legitimately needs each sequence length. The
  // entry for length 0 is larger than anything utf8_decode() can produce for
  // an invalid lead byte, so that case is always flagged.
  static const u32 mi[] = {4194304, 0, 128, 2048, 65536};
  // Low error bits to discard: they describe bytes beyond the sequence.
  static const int she[] = {0, 6, 4, 2, 0};
  unsigned char *s = (unsigned char *)b;
  unsigned char *n = (unsigned char *)utf8_decode(b, c);
  usize l = utf8_expected(s[0]);
  int err;

  err  = (*c < mi[l]) << 6;          // Non-canonical encoding.
  err |= ((*c >> 11) == 0x1b) << 7;  // Surrogate half?
  err |= (*c > 0x10FFFF) << 8;       // Out of range?
  err |= (s[1] & 0xc0) >> 2;
  err |= (s[2] & 0xc0) >> 4;
  err |= (s[3])        >> 6;
  err ^= 0x2a;                       // Top two bits of each tail byte correct?
  err >>= she[l];
  *e = err;
  return n;
}

//
// Decodes the character that ends just before byte offset x in s.
//
// Walks back from x over continuation bytes (10xxxxxx) and one more byte,
// decodes from there with utf8_decode(), stores the result in *c and returns
// how many bytes were stepped over. The walk never goes below offset 0.
//
// With x == 0 there is no previous character: *c is set to 0 and 0 is
// returned without reading s.
//
static inline usize
utf8_decode_rev(const unsigned char *s, usize x, u32 *c)
{
  if (x == 0) {
    *c = 0;
    return 0;
  }

  usize i = x;
  while (i > 0 && (s[i - 1] & 0xc0) == 0x80) i--;
  if (i > 0) i--;

  utf8_decode((void *)(s + i), c);
  return x - i;
}

//
// Sums utf8_width() over the characters decoded from the l bytes at s.
//
// Returns the total. *lw receives the width of the last character decoded,
// or 0 when l is 0. Decoding goes through utf8_decode(), which reads four
// bytes at every position, so bytes just past s + l may be read.
//
static inline usize
utf8_cols(const void *s, usize l, usize *lw)
{
  const unsigned char *cur = (const unsigned char *)s;
  const unsigned char *const end = cur + l;
  usize total = 0;
  usize last = 0;

  while (cur < end) {
    u32 cp;
    cur = utf8_decode((void *)cur, &cp);
    last = utf8_width(cp);
    total += last;
  }

  *lw = last;
  return total;
}

//
// Finds the longest prefix of the l bytes at s that fits in c columns.
//
// Returns the prefix length in bytes and stores the columns it occupies in
// *oc. Characters are taken whole: one that would exceed c ends the prefix.
// Zero width characters that follow the last one that fits are kept.
//
// The result is never larger than l. A final sequence whose lead byte
// announces more bytes than remain before s + l is left out instead of being
// counted from bytes outside the input.
//
// Decoding goes through utf8_decode(), which reads four bytes at every
// position, so bytes just past s + l may still be read.
//
static inline usize
utf8_trunc(const char *s, usize l, usize c, usize *oc)
{
  const unsigned char *p = (const unsigned char *)s;
  const unsigned char *e = p + l;
  usize co = 0;
  while (p < e) {
    u32 cp;
    const unsigned char *n = utf8_decode((void *)p, &cp);
    // Incomplete trailing sequence: stop before it.
    if ((usize)(n - p) > (usize)(e - p)) break;
    usize w = utf8_width(cp);
    // co never exceeds c, so the subtraction cannot wrap.
    if (w > c - co) break;
    co += w;
    p = n;
  }
  *oc = co;
  return (usize)(p - (const unsigned char *)s);
}

#endif // DYLAN_UTF8_H