1 // Copyright 2011 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
5 // Note: the file data_test.go that is generated should not be checked in.
6 //go:generate go run maketables.go triegen.go
7 //go:generate go test -tags test
9 // Package norm contains types and functions for normalizing Unicode strings.
10 package norm // import "golang.org/x/text/unicode/norm"
15 "golang.org/x/text/transform"
18 // A Form denotes a canonical representation of Unicode code points.
19 // The Unicode-defined normalization and equivalence forms are:
21 // NFC Unicode Normalization Form C
22 // NFD Unicode Normalization Form D
23 // NFKC Unicode Normalization Form KC
24 // NFKD Unicode Normalization Form KD
26 // For a Form f, this documentation uses the notation f(x) to mean
27 // the bytes or string x converted to the given form.
28 // A position n in x is called a boundary if conversion to the form can
29 // proceed independently on both sides:
30 // f(x) == append(f(x[0:n]), f(x[n:])...)
32 // References: https://unicode.org/reports/tr15/ and
33 // https://unicode.org/notes/tn5/.
43 // Bytes returns f(b). May return b if f(b) = b.
44 func (f Form) Bytes(b []byte) []byte {
47 n, ok := ft.quickSpan(src, 0, len(b), true)
51 out := make([]byte, n, len(b))
53 rb := reorderBuffer{f: *ft, src: src, nsrc: len(b), out: out, flushF: appendFlush}
54 return doAppendInner(&rb, n)
57 // String returns f(s).
58 func (f Form) String(s string) string {
61 n, ok := ft.quickSpan(src, 0, len(s), true)
65 out := make([]byte, n, len(s))
67 rb := reorderBuffer{f: *ft, src: src, nsrc: len(s), out: out, flushF: appendFlush}
68 return string(doAppendInner(&rb, n))
71 // IsNormal returns true if b == f(b).
72 func (f Form) IsNormal(b []byte) bool {
75 bp, ok := ft.quickSpan(src, 0, len(b), true)
79 rb := reorderBuffer{f: *ft, src: src, nsrc: len(b)}
80 rb.setFlusher(nil, cmpNormalBytes)
83 if bp = decomposeSegment(&rb, bp, true); bp < 0 {
86 bp, _ = rb.f.quickSpan(rb.src, bp, len(b), true)
91 func cmpNormalBytes(rb *reorderBuffer) bool {
93 for i := 0; i < rb.nrune; i++ {
95 if int(info.size) > len(b) {
101 if b[0] != rb.byte[p] {
110 // IsNormalString returns true if s == f(s).
111 func (f Form) IsNormalString(s string) bool {
112 src := inputString(s)
114 bp, ok := ft.quickSpan(src, 0, len(s), true)
118 rb := reorderBuffer{f: *ft, src: src, nsrc: len(s)}
119 rb.setFlusher(nil, func(rb *reorderBuffer) bool {
120 for i := 0; i < rb.nrune; i++ {
122 if bp+int(info.size) > len(s) {
128 if s[bp] != rb.byte[p] {
137 if bp = decomposeSegment(&rb, bp, true); bp < 0 {
140 bp, _ = rb.f.quickSpan(rb.src, bp, len(s), true)
145 // patchTail fixes a case where a rune may be incorrectly normalized
146 // if it is followed by illegal continuation bytes. It returns the
147 // patched buffer and whether the decomposition is still in progress.
148 func patchTail(rb *reorderBuffer) bool {
149 info, p := lastRuneStart(&rb.f, rb.out)
150 if p == -1 || info.size == 0 {
153 end := p + int(info.size)
154 extra := len(rb.out) - end
156 // Potentially allocating memory. However, this only
157 // happens with ill-formed UTF-8.
159 x = append(x, rb.out[len(rb.out)-extra:]...)
160 rb.out = rb.out[:end]
161 decomposeToLastBoundary(rb)
163 rb.out = append(rb.out, x...)
168 decomposeToLastBoundary(rb)
169 if s := rb.ss.next(info); s == ssStarter {
172 } else if s == ssOverflow {
177 rb.insertUnsafe(inputBytes(buf), 0, info)
181 func appendQuick(rb *reorderBuffer, i int) int {
185 end, _ := rb.f.quickSpan(rb.src, i, rb.nsrc, true)
186 rb.out = rb.src.appendSlice(rb.out, i, end)
190 // Append returns f(append(out, b...)).
191 // The buffer out must be nil, empty, or equal to f(out).
192 func (f Form) Append(out []byte, src ...byte) []byte {
193 return f.doAppend(out, inputBytes(src), len(src))
196 func (f Form) doAppend(out []byte, src input, n int) []byte {
201 // Attempt to do a quickSpan first so we can avoid initializing the reorderBuffer.
203 p, _ := ft.quickSpan(src, 0, n, true)
204 out = src.appendSlice(out, 0, p)
208 rb := reorderBuffer{f: *ft, src: src, nsrc: n, out: out, flushF: appendFlush}
209 return doAppendInner(&rb, p)
211 rb := reorderBuffer{f: *ft, src: src, nsrc: n}
212 return doAppend(&rb, out, 0)
215 func doAppend(rb *reorderBuffer, out []byte, p int) []byte {
216 rb.setFlusher(out, appendFlush)
217 src, n := rb.src, rb.nsrc
218 doMerge := len(out) > 0
219 if q := src.skipContinuationBytes(p); q > p {
220 // Move leading non-starters to destination.
221 rb.out = src.appendSlice(rb.out, p, q)
223 doMerge = patchTail(rb)
229 info = fd.info(src, p)
230 if !info.BoundaryBefore() || info.nLeadingNonStarters() > 0 {
232 decomposeToLastBoundary(rb)
234 p = decomposeSegment(rb, p, true)
239 // Append incomplete UTF-8 encoding.
240 return src.appendSlice(rb.out, p, n)
243 return doAppendInner(rb, p)
246 p = appendQuick(rb, p)
247 return doAppendInner(rb, p)
250 func doAppendInner(rb *reorderBuffer, p int) []byte {
251 for n := rb.nsrc; p < n; {
252 p = decomposeSegment(rb, p, true)
253 p = appendQuick(rb, p)
258 // AppendString returns f(append(out, []byte(s))).
259 // The buffer out must be nil, empty, or equal to f(out).
260 func (f Form) AppendString(out []byte, src string) []byte {
261 return f.doAppend(out, inputString(src), len(src))
264 // QuickSpan returns a boundary n such that b[0:n] == f(b[0:n]).
265 // It is not guaranteed to return the largest such n.
266 func (f Form) QuickSpan(b []byte) int {
267 n, _ := formTable[f].quickSpan(inputBytes(b), 0, len(b), true)
271 // Span implements transform.SpanningTransformer. It returns a boundary n such
272 // that b[0:n] == f(b[0:n]). It is not guaranteed to return the largest such n.
273 func (f Form) Span(b []byte, atEOF bool) (n int, err error) {
274 n, ok := formTable[f].quickSpan(inputBytes(b), 0, len(b), atEOF)
277 err = transform.ErrEndOfSpan
279 err = transform.ErrShortSrc
285 // SpanString returns a boundary n such that s[0:n] == f(s[0:n]).
286 // It is not guaranteed to return the largest such n.
287 func (f Form) SpanString(s string, atEOF bool) (n int, err error) {
288 n, ok := formTable[f].quickSpan(inputString(s), 0, len(s), atEOF)
291 err = transform.ErrEndOfSpan
293 err = transform.ErrShortSrc
299 // quickSpan returns a boundary n such that src[0:n] == f(src[0:n]) and
300 // whether any non-normalized parts were found. If atEOF is false, n will
301 // not point past the last segment if this segment might be become
302 // non-normalized by appending other runes.
303 func (f *formInfo) quickSpan(src input, i, end int, atEOF bool) (n int, ok bool) {
307 for n = end; i < n; {
308 if j := src.skipASCII(i, n); i != j {
315 info := f.info(src, i)
318 // include incomplete runes
321 return lastSegStart, true
323 // This block needs to be before the next, because it is possible to
324 // have an overflow for runes that are starters (e.g. with U+FF9E).
325 switch ss.next(info) {
329 return lastSegStart, false
331 if lastCC > info.ccc {
332 return lastSegStart, false
353 return lastSegStart, false
356 // QuickSpanString returns a boundary n such that s[0:n] == f(s[0:n]).
357 // It is not guaranteed to return the largest such n.
358 func (f Form) QuickSpanString(s string) int {
359 n, _ := formTable[f].quickSpan(inputString(s), 0, len(s), true)
363 // FirstBoundary returns the position i of the first boundary in b
364 // or -1 if b contains no boundary.
365 func (f Form) FirstBoundary(b []byte) int {
366 return f.firstBoundary(inputBytes(b), len(b))
369 func (f Form) firstBoundary(src input, nsrc int) int {
370 i := src.skipContinuationBytes(0)
376 // We should call ss.first here, but we can't as the first rune is
377 // skipped already. This means FirstBoundary can't really determine
378 // CGJ insertion points correctly. Luckily it doesn't have to.
380 info := fd.info(src, i)
384 if s := ss.next(info); s != ssSuccess {
389 if !info.BoundaryAfter() && !ss.isMax() {
397 // FirstBoundaryInString returns the position i of the first boundary in s
398 // or -1 if s contains no boundary.
399 func (f Form) FirstBoundaryInString(s string) int {
400 return f.firstBoundary(inputString(s), len(s))
403 // NextBoundary reports the index of the boundary between the first and next
404 // segment in b or -1 if atEOF is false and there are not enough bytes to
405 // determine this boundary.
406 func (f Form) NextBoundary(b []byte, atEOF bool) int {
407 return f.nextBoundary(inputBytes(b), len(b), atEOF)
410 // NextBoundaryInString reports the index of the boundary between the first and
411 // next segment in b or -1 if atEOF is false and there are not enough bytes to
412 // determine this boundary.
413 func (f Form) NextBoundaryInString(s string, atEOF bool) int {
414 return f.nextBoundary(inputString(s), len(s), atEOF)
417 func (f Form) nextBoundary(src input, nsrc int, atEOF bool) int {
425 info := fd.info(src, 0)
435 for i := int(info.size); i < nsrc; i += int(info.size) {
436 info = fd.info(src, i)
443 // TODO: Using streamSafe to determine the boundary isn't the same as
444 // using BoundaryBefore. Determine which should be used.
445 if s := ss.next(info); s != ssSuccess {
449 if !atEOF && !info.BoundaryAfter() && !ss.isMax() {
455 // LastBoundary returns the position i of the last boundary in b
456 // or -1 if b contains no boundary.
457 func (f Form) LastBoundary(b []byte) int {
458 return lastBoundary(formTable[f], b)
461 func lastBoundary(fd *formInfo, b []byte) int {
463 info, p := lastRuneStart(fd, b)
467 if info.size == 0 { // ends with incomplete rune
468 if p == 0 { // starts with incomplete rune
472 info, p = lastRuneStart(fd, b[:i])
473 if p == -1 { // incomplete UTF-8 encoding or non-starter bytes without a starter
477 if p+int(info.size) != i { // trailing non-starter bytes: illegal UTF-8
480 if info.BoundaryAfter() {
484 v := ss.backwards(info)
485 for i = p; i >= 0 && v != ssStarter; i = p {
486 info, p = lastRuneStart(fd, b[:i])
487 if v = ss.backwards(info); v == ssOverflow {
490 if p+int(info.size) != i {
491 if p == -1 { // no boundary found
494 return i // boundary after an illegal UTF-8 encoding
500 // decomposeSegment scans the first segment in src into rb. It inserts 0x034f
501 // (Grapheme Joiner) when it encounters a sequence of more than 30 non-starters
502 // and returns the number of bytes consumed from src or iShortDst or iShortSrc.
503 func decomposeSegment(rb *reorderBuffer, sp int, atEOF bool) int {
504 // Force one character to be consumed.
505 info := rb.f.info(rb.src, sp)
509 if s := rb.ss.next(info); s == ssStarter {
510 // TODO: this could be removed if we don't support merging.
514 } else if s == ssOverflow {
518 if err := rb.insertFlush(rb.src, sp, info); err != iSuccess {
524 if !atEOF && !info.BoundaryAfter() {
525 return int(iShortSrc)
529 info = rb.f.info(rb.src, sp)
532 return int(iShortSrc)
536 if s := rb.ss.next(info); s == ssStarter {
538 } else if s == ssOverflow {
542 if err := rb.insertFlush(rb.src, sp, info); err != iSuccess {
548 return int(iShortDst)
553 // lastRuneStart returns the runeInfo and position of the last
554 // rune in buf or the zero runeInfo and -1 if no rune was found.
555 func lastRuneStart(fd *formInfo, buf []byte) (Properties, int) {
557 for ; p >= 0 && !utf8.RuneStart(buf[p]); p-- {
560 return Properties{}, -1
562 return fd.info(inputBytes(buf), p), p
565 // decomposeToLastBoundary finds an open segment at the end of the buffer
566 // and scans it into rb. Returns the buffer minus the last segment.
567 func decomposeToLastBoundary(rb *reorderBuffer) {
569 info, i := lastRuneStart(fd, rb.out)
570 if int(info.size) != len(rb.out)-i {
571 // illegal trailing continuation bytes
574 if info.BoundaryAfter() {
577 var add [maxNonStarters + 1]Properties // stores runeInfo in reverse order
583 v := ss.backwards(info)
585 // Note that if we have an overflow, it the string we are appending to
586 // is not correctly normalized. In this case the behavior is undefined.
591 if v == ssStarter || p < 0 {
594 info, i = lastRuneStart(fd, rb.out[:p])
595 if int(info.size) != p-i {
600 // Copy bytes for insertion as we may need to overwrite rb.out.
601 var buf [maxBufferSize * utf8.UTFMax]byte
602 cp := buf[:copy(buf[:], rb.out[p:])]
604 for padd--; padd >= 0; padd-- {
606 rb.insertUnsafe(inputBytes(cp), 0, info)