mirror of
https://github.com/ii64/sonic.git
synced 2026-06-20 16:45:22 +08:00
138 lines
4.3 KiB
Go
138 lines
4.3 KiB
Go
/*
|
||
* Copyright 2022 ByteDance Inc.
|
||
*
|
||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||
* you may not use this file except in compliance with the License.
|
||
* You may obtain a copy of the License at
|
||
*
|
||
* http://www.apache.org/licenses/LICENSE-2.0
|
||
*
|
||
* Unless required by applicable law or agreed to in writing, software
|
||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||
* See the License for the specific language governing permissions and
|
||
* limitations under the License.
|
||
*/
|
||
|
||
package utf8
|
||
|
||
import (
|
||
`testing`
|
||
`strings`
|
||
`github.com/stretchr/testify/assert`
|
||
`unicode/utf8`
|
||
`bytes`
|
||
`math/rand`
|
||
)
|
||
|
||
// Byte fragments used by the tests in this file to assemble malformed UTF-8.
var (
	// Lead bytes that announce a 2-, 3- and 4-byte sequence. Without the
	// required continuation bytes behind them they do not form valid UTF-8.
	_Header_2Bytes = "\xC0"
	_Header_3Bytes = "\xE0"
	_Header_4Bytes = "\xF0"

	// UTF-16 surrogate code points written out in the 3-byte UTF-8 layout.
	// Surrogates are not valid in UTF-8, so both strings are malformed.
	// U+D800..U+DBFF are high (leading) surrogates and U+DC00..U+DFFF are
	// low (trailing) surrogates, so each name is bound to the matching value.
	_High_Surrogate = "\xED\xA0\x80" // \ud800
	_Low_Surrogate  = "\xED\xB0\x80" // \udc00

	// A lone continuation byte (bit pattern 10xxxxxx).
	_Cont = "\xb0"
)
|
||
|
||
func TestCorrectWith_InvalidUtf8(t *testing.T) {
|
||
var tests = []struct {
|
||
name string
|
||
input string
|
||
expect string
|
||
errpos int
|
||
} {
|
||
{"basic", `abc`, "abc", -1},
|
||
{"long", strings.Repeat("helloα,景😊", 1000), strings.Repeat("helloα,景😊", 1000), -1},
|
||
|
||
// invalid utf8 - single byte
|
||
{"single_Cont", _Cont, "\ufffd", 0},
|
||
{"single_Header_2Bytes", _Header_2Bytes, "\ufffd", 0},
|
||
{"single_Header_3Bytes", _Header_3Bytes, "\ufffd", 0},
|
||
{"single_Header_4Bytes", _Header_4Bytes, "\ufffd", 0},
|
||
|
||
// invalid utf8 - two bytes
|
||
{"two_Header_2Bytes + _Cont", _Header_2Bytes + _Cont, "\ufffd\ufffd", 0},
|
||
{`two_Header_4Bytes + _Cont+ "xx"`, _Header_4Bytes + _Cont + "xx", "\ufffd\ufffdxx", 0},
|
||
{ `"xx" + three_Header_4Bytes + _Cont + _Cont`, "xx" + _Header_4Bytes + _Cont + _Cont, "xx\ufffd\ufffd\ufffd", 2},
|
||
|
||
// invalid utf8 - three bytes
|
||
{`three_Low_Surrogate`, _Low_Surrogate, "\ufffd\ufffd\ufffd", 0},
|
||
{`three__High_Surrogate`, _High_Surrogate, "\ufffd\ufffd\ufffd", 0},
|
||
|
||
// invalid utf8 - multi bytes
|
||
{`_High_Surrogate + _Low_Surrogate`, _High_Surrogate + _Low_Surrogate, "\ufffd\ufffd\ufffd\ufffd\ufffd\ufffd", 0},
|
||
{`"\x80\x80\x80\x80"`, "\x80\x80\x80\x80", "\ufffd\ufffd\ufffd\ufffd", 0},
|
||
}
|
||
for _, test := range tests {
|
||
got := CorrectWith(nil, []byte(test.input), "\ufffd")
|
||
assert.Equal(t, []byte(test.expect), got, test.name)
|
||
assert.Equal(t,test.errpos == -1, utf8.ValidString(test.input), test.name)
|
||
}
|
||
}
|
||
|
||
// genRandBytes returns length pseudo-random bytes, each taken from the full
// 0x00..0xFF range via the package-level math/rand generator. The result is
// arbitrary binary data and is not required to be valid UTF-8. A length of
// zero or less produces an empty result.
func genRandBytes(length int) []byte {
	out := new(bytes.Buffer)
	for remaining := length; remaining > 0; remaining-- {
		b := byte(rand.Intn(256))
		out.WriteByte(b)
	}
	return out.Bytes()
}
|
||
|
||
// genRandAscii returns length pseudo-random bytes restricted to the 7-bit
// range 0x00..0x7F, drawn from the package-level math/rand generator. A
// length of zero or less produces an empty result.
func genRandAscii(length int) []byte {
	out := new(bytes.Buffer)
	for remaining := length; remaining > 0; remaining-- {
		b := byte(rand.Intn(128))
		out.WriteByte(b)
	}
	return out.Bytes()
}
|
||
|
||
// genRandRune returns the concatenated encoding of length pseudo-random code
// points picked from 0..0x10FFFF with the package-level math/rand generator.
// Note that length counts runes, not bytes, so the result is usually longer
// than length.
//
// NOTE(review): the range includes surrogate values, which are not encodable;
// bytes.Buffer.WriteRune is expected to write U+FFFD for those, keeping the
// output valid UTF-8 — confirm against its documentation.
func genRandRune(length int) []byte {
	out := new(bytes.Buffer)
	for remaining := length; remaining > 0; remaining-- {
		r := rune(rand.Intn(0x110000))
		out.WriteRune(r)
	}
	return out.Bytes()
}
|
||
|
||
func TestValidate_Random(t *testing.T) {
|
||
// compare with stdlib
|
||
compare := func(t *testing.T, data []byte) {
|
||
assert.Equal(t, utf8.Valid(data), Validate(data), string(data))
|
||
}
|
||
|
||
// random testing
|
||
nums := 1000
|
||
maxLen := 1000
|
||
for i := 0; i < nums; i++ {
|
||
length := rand.Intn(maxLen)
|
||
compare(t, genRandBytes(length))
|
||
compare(t, genRandRune(length))
|
||
}
|
||
}
|
||
|
||
func BenchmarkValidate(b *testing.B) {
|
||
bench := []struct {
|
||
name string
|
||
data []byte
|
||
} {
|
||
{"ValidAscii", genRandAscii(1000)},
|
||
{"ValidUTF8", genRandRune(1000)},
|
||
{"RandomBytes", genRandBytes(1000)},
|
||
}
|
||
|
||
for _, test := range bench {
|
||
if utf8.Valid(test.data) != Validate(test.data) {
|
||
b.Fatalf("sonic utf8 validate wrong for %s string: %v", test.name, test.data)
|
||
}
|
||
b.Run("Sonic_" + test.name, func(b *testing.B) {
|
||
for i := 0; i < b.N; i++ {
|
||
Validate(test.data)
|
||
}
|
||
})
|
||
b.Run("StdLib_" + test.name, func(b *testing.B) {
|
||
for i := 0; i < b.N; i++ {
|
||
utf8.Valid(test.data)
|
||
}
|
||
})
|
||
}
|
||
}
|