package ucd

import (
	"bufio"
	"encoding/binary"
	"encoding/hex"
	"fmt"
	"io"
	"regexp"
	"strings"
)

// CodePointRange represents an inclusive range of Unicode code points.
type CodePointRange struct {
	From rune
	To   rune
}

// UnicodeData holds the properties extracted from UnicodeData.txt.
type UnicodeData struct {
	GeneralCategory map[string][]*CodePointRange
}

// ParseUnicodeData parses UnicodeData.txt and groups code point ranges by
// their General_Category value. Code points that do not appear in the file
// are assigned the default value declared in PropertyValueAliases.txt.
func ParseUnicodeData(r io.Reader, propValAliases *PropertyValueAliases) (*UnicodeData, error) {
	gc2CPRange := map[string][]*CodePointRange{}
	lastCPTo := rune(-1)
	p := newParser(r)
	for p.parse() {
		if len(p.fields) == 0 {
			continue
		}
		cpFrom, cpTo, err := parseCodePointRange(p.fields[0])
		if err != nil {
			return nil, err
		}

		// Code points that are skipped in UnicodeData.txt take the default
		// General_Category value (normally Unassigned).
		if cpFrom-lastCPTo > 1 {
			defaultGCVal := propValAliases.GeneralCategoryDefaultValue
			gc2CPRange[defaultGCVal] = append(gc2CPRange[defaultGCVal], &CodePointRange{
				From: lastCPTo + 1,
				To:   cpFrom - 1,
			})
		}
		lastCPTo = cpTo

		// Merge contiguous ranges that share a General_Category value.
		gc := NormalizeSymbolicValue(p.fields[2])
		rs, ok := gc2CPRange[gc]
		if ok {
			r := rs[len(rs)-1]
			if cpFrom-r.To == 1 {
				r.To = cpTo
			} else {
				gc2CPRange[gc] = append(rs, &CodePointRange{
					From: cpFrom,
					To:   cpTo,
				})
			}
		} else {
			gc2CPRange[gc] = []*CodePointRange{
				{
					From: cpFrom,
					To:   cpTo,
				},
			}
		}
	}
	if p.err != nil {
		return nil, p.err
	}

	// Fill the tail of the code space with the default value as well.
	if lastCPTo < propValAliases.GeneralCategoryDefaultRange.To {
		defaultGCVal := propValAliases.GeneralCategoryDefaultValue
		gc2CPRange[defaultGCVal] = append(gc2CPRange[defaultGCVal], &CodePointRange{
			From: lastCPTo + 1,
			To:   propValAliases.GeneralCategoryDefaultRange.To,
		})
	}

	return &UnicodeData{
		GeneralCategory: gc2CPRange,
	}, nil
}

// PropertyValueAliases holds the aliases extracted from PropertyValueAliases.txt.
type PropertyValueAliases struct {
	GeneralCategory             map[string]string
	GeneralCategoryDefaultRange *CodePointRange
	GeneralCategoryDefaultValue string
}

// ParsePropertyValueAliases parses PropertyValueAliases.txt. It maps every
// alias of a General_Category value to its abbreviation and records the
// default General_Category value and range declared in an @missing comment.
func ParsePropertyValueAliases(r io.Reader) (*PropertyValueAliases, error) {
	catName2Abbs := map[string]string{}
	var defaultGCCPRange *CodePointRange
	var defaultGCVal string
	p := newParser(r)
	for p.parse() {
		if len(p.fields) > 0 && p.fields[0] == "gc" {
			catNameShort := NormalizeSymbolicValue(p.fields[1])
			catNameLong := NormalizeSymbolicValue(p.fields[2])
			catName2Abbs[catNameShort] = catNameShort
			catName2Abbs[catNameLong] = catNameShort
			for _, f := range p.fields[3:] {
				catNameOther := NormalizeSymbolicValue(f)
				catName2Abbs[catNameOther] = catNameShort
			}
		}
		if len(p.defaultFields) > 0 && p.defaultFields[1] == "General_Category" {
			cpFrom, cpTo, err := parseCodePointRange(p.defaultFields[0])
			if err != nil {
				return nil, err
			}
			defaultGCCPRange = &CodePointRange{
				From: cpFrom,
				To:   cpTo,
			}
			defaultGCVal = NormalizeSymbolicValue(p.defaultFields[2])
		}
	}
	if p.err != nil {
		return nil, p.err
	}
	return &PropertyValueAliases{
		GeneralCategory:             catName2Abbs,
		GeneralCategoryDefaultRange: defaultGCCPRange,
		GeneralCategoryDefaultValue: defaultGCVal,
	}, nil
}

var symValReplacer = strings.NewReplacer("_", "", "-", "", "\x20", "")

// NormalizeSymbolicValue normalizes a symbolic value following the loose
// matching rule (UAX44-LM3): it removes whitespace, underscores, and hyphens,
// lowercases the result, and strips an initial "is" prefix.
func NormalizeSymbolicValue(original string) string {
	v := strings.ToLower(symValReplacer.Replace(original))
	if strings.HasPrefix(v, "is") && v != "is" {
		return v[2:]
	}
	return v
}

// Fields is the list of field values found on a single line.
type Fields []string

var (
	reLine               = regexp.MustCompile(`^\s*(.*?)\s*(#.*)?$`)
	reCodePointRange     = regexp.MustCompile(`^([[:xdigit:]]+)(?:\.\.([[:xdigit:]]+))?$`)
	specialCommentPrefix = "# @missing:"
)

// This parser can parse data files of the Unicode Character Database (UCD).
// Specifically, it does the following two things:
// - Converts each line of the data files into a slice of fields.
// - Recognizes specially-formatted comments starting with `@missing` and generates a slice of fields.
//
// However, for practical purposes, each field needs to be analyzed more specifically.
// For instance, in UnicodeData.txt, the first field represents a range of code points,
// so it needs to be recognized as a hexadecimal string.
// You can perform more specific parsing for each file by implementing a dedicated parser that wraps this parser.
//
// https://www.unicode.org/reports/tr44/#Format_Conventions
type parser struct {
	scanner       *bufio.Scanner
	fields        Fields
	defaultFields Fields
	err           error
}

func newParser(r io.Reader) *parser {
	return &parser{
		scanner: bufio.NewScanner(r),
	}
}

// parse advances the parser to the next line that contains data fields or an
// @missing comment. It returns false when the input is exhausted or an error
// occurs; the error, if any, is stored in p.err.
func (p *parser) parse() bool {
	for p.scanner.Scan() {
		p.fields, p.defaultFields, p.err = parseRecord(p.scanner.Text())
		if p.err != nil {
			return false
		}
		if p.fields != nil || p.defaultFields != nil {
			return true
		}
	}
	p.err = p.scanner.Err()
	return false
}

// parseRecord splits a line into its data fields and, when the line carries a
// `# @missing:` comment, into the fields of that comment.
func parseRecord(src string) (Fields, Fields, error) {
	ms := reLine.FindStringSubmatch(src)
	fields := ms[1]
	comment := ms[2]
	var fs Fields
	if fields != "" {
		fs = parseFields(fields)
	}
	var defaultFs Fields
	if strings.HasPrefix(comment, specialCommentPrefix) {
		defaultFs = parseFields(strings.TrimPrefix(comment, specialCommentPrefix))
	}
	return fs, defaultFs, nil
}

func parseFields(src string) Fields {
	var fields Fields
	for _, f := range strings.Split(src, ";") {
		fields = append(fields, strings.TrimSpace(f))
	}
	return fields
}

// parseCodePointRange parses a single code point like `0041` or a range like
// `0000..001F` and returns its boundaries.
func parseCodePointRange(src string) (rune, rune, error) {
	cp := reCodePointRange.FindStringSubmatch(src)
	if cp == nil {
		return 0, 0, fmt.Errorf("invalid code point range: %v", src)
	}
	from, err := decodeHexToRune(cp[1])
	if err != nil {
		return 0, 0, err
	}
	to := from
	if cp[2] != "" {
		to, err = decodeHexToRune(cp[2])
		if err != nil {
			return 0, 0, err
		}
	}
	return from, to, nil
}

// decodeHexToRune decodes a hexadecimal code point string (at most 8 digits)
// into a rune.
func decodeHexToRune(hexCodePoint string) (rune, error) {
	// hex.DecodeString requires an even number of digits, so pad with a
	// leading zero when necessary.
	h := hexCodePoint
	if len(h)%2 != 0 {
		h = "0" + h
	}
	b, err := hex.DecodeString(h)
	if err != nil {
		return 0, err
	}
	// Left-pad the byte slice to 4 bytes so it can be read as a big-endian uint32.
	l := len(b)
	for i := 0; i < 4-l; i++ {
		b = append([]byte{0}, b...)
	}
	n := binary.BigEndian.Uint32(b)
	return rune(n), nil
}
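
// exampleParseUCD is a minimal usage sketch showing how the two exported
// parsers are meant to be combined: ParsePropertyValueAliases runs first so
// that its default General_Category value and range can inform
// ParseUnicodeData. The function name and the inline sample data are
// illustrative assumptions only; real callers would read the actual
// PropertyValueAliases.txt and UnicodeData.txt files from the UCD, and only
// the code point and General_Category fields are consulted here.
func exampleParseUCD() error {
	propValAliases := `
gc ; Lu ; Uppercase_Letter
gc ; Cn ; Unassigned
# @missing: 0000..10FFFF; General_Category; Unassigned
`
	unicodeData := `
0041;LATIN CAPITAL LETTER A;Lu;0;L;;;;;N;;;;0061;
0042;LATIN CAPITAL LETTER B;Lu;0;L;;;;;N;;;;0062;
`
	pva, err := ParsePropertyValueAliases(strings.NewReader(propValAliases))
	if err != nil {
		return err
	}
	ud, err := ParseUnicodeData(strings.NewReader(unicodeData), pva)
	if err != nil {
		return err
	}
	// Print every range grouped by its normalized General_Category value,
	// e.g. "lu: 0041..0042" and the surrounding "unassigned" ranges.
	for gc, ranges := range ud.GeneralCategory {
		for _, r := range ranges {
			fmt.Printf("%s: %04X..%04X\n", gc, r.From, r.To)
		}
	}
	return nil
}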