src/util/support/t_utf8.c


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209

/* -*- mode: c; c-basic-offset: 4; indent-tabs-mode: nil -*- */
/* util/support/t_utf8.c - test UTF-8 boundary conditions */
/*
 * Copyright (C) 2015 by the Massachusetts Institute of Technology.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * * Redistributions of source code must retain the above copyright
 *   notice, this list of conditions and the following disclaimer.
 *
 * * Redistributions in binary form must reproduce the above copyright
 *   notice, this list of conditions and the following disclaimer in
 *   the documentation and/or other materials provided with the
 *   distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
 * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
 * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
 * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
 * OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <stdio.h>
#include <string.h>

#include "k5-platform.h"
#include "k5-utf8.h"

/*
 * Convenience macro to allow testing of old encodings.
 *
 * "Old" means ISO/IEC 10646 prior to 2011, when the highest valid code point
 * was U+7FFFFFFF instead of U+10FFFF.
 */
#ifdef OLDENCODINGS
#define L(x) (x)
#else
#define L(x) 0
#endif

/*
 * len is 0 for invalid encoding prefixes (krb5int_utf8_charlen2() partially
 * enforces the validity of the first two bytes, based on masking the second
 * byte.  It doesn't check whether bit 6 is 0, though, and doesn't catch the
 * range between U+110000 and U+13FFFF).
 *
 * ucs is 0 for invalid encodings (including ones with valid prefixes according
 * to krb5int_utf8_charlen2(); krb5int_utf8_to_ucs4() will still fail on them
 * because it checks more things.)  Code points above U+10FFFF are excluded by
 * the actual test code and remain in the table for possibly testing the old
 * implementation that didn't exclude them.
 *
 * Neither krb5int_ucs4_to_utf8() nor krb5int_utf8_to_ucs4() excludes the
 * surrogate pair range.
 */
struct testcase {
    const char *p;
    krb5_ucs4 ucs;
    int len;
} testcases[] = {
    { "\x7f", 0x0000007f, 1 },             /* Lowest 1-byte encoding */
    { "\xc0\x80", 0x00000000, 0 },         /* Invalid 2-byte encoding */
    { "\xc2\x80", 0x00000080, 2 },         /* Lowest valid 2-byte encoding */
    { "\xdf\xbf", 0x000007ff, 2 },         /* Highest valid 2-byte encoding*/
    { "\xdf\xff", 0x00000000, 2 },         /* Invalid 2-byte encoding*/
    { "\xe0\x80\x80", 0x00000000, 0 },     /* Invalid 3-byte encoding */
    { "\xe0\xa0\x80", 0x00000800, 3 },     /* Lowest valid 3-byte encoding */
    { "\xef\xbf\xbf", 0x0000ffff, 3 },     /* Highest valid 3-byte encoding */
    { "\xef\xff\xff", 0x00000000, 3 },     /* Invalid 3-byte encoding */
    { "\xf0\x80\x80\x80", 0x00000000, 0 }, /* Invalid 4-byte encoding */
    { "\xf0\x90\x80\x80", 0x00010000, 4 }, /* Lowest valid 4-byte encoding */
    { "\xf4\x8f\xbf\xbf", 0x0010ffff, 4 }, /* Highest valid 4-byte encoding */
    /* Next higher 4-byte encoding (old) */
    { "\xf4\x90\x80\x80", 0x00110000, 4 },
    /* Highest 4-byte encoding starting with 0xf4 (old) */
    { "\xf4\xbf\xbf\xbf", 0x0013ffff, 4 },
    /* Next higher 4-byte prefix byte (old) */
    { "\xf5\x80\x80\x80", 0x00140000, L(4) },
    /* Highest valid 4-byte encoding (old) */
    { "\xf7\xbf\xbf\xbf", 0x001fffff, L(4) },
    /* Invalid 4-byte encoding */
    { "\xf7\xff\xff\xff", 0x00000000, L(4) },
    /* Invalid 5-byte encoding */
    { "\xf8\x80\x80\x80\x80", 0x00000000, 0 },
    /* Lowest valid 5-byte encoding (old) */
    { "\xf8\x88\x80\x80\x80", 0x00200000, L(5) },
    /* Highest valid 5-byte encoding (old) */
    { "\xfb\xbf\xbf\xbf\xbf", 0x03ffffff, L(5) },
    /* Invalid 5-byte encoding */
    { "\xfb\xff\xff\xff\xff", 0x00000000, L(5) },
    /* Invalid 6-byte encoding */
    { "\xfc\x80\x80\x80\x80\x80", 0x00000000, 0 },
    /* Lowest valid 6-byte encoding (old) */
    { "\xfc\x84\x80\x80\x80\x80", 0x04000000, L(6) },
    /* Highest valid 6-byte encoding (old) */
    { "\xfd\xbf\xbf\xbf\xbf\xbf", 0x7fffffff, L(6) },
    /* Invalid 6-byte encoding */
    { "\xfd\xff\xff\xff\xff\xff", 0x00000000, L(6) },
};

static void
printhex(const char *p)
{
    for (; *p != '\0'; p++) {
        printf("%02x ", (unsigned char)*p);
    }
}

static void
printtest(struct testcase *t)
{
    printhex(t->p);
    printf("0x%08lx, %d\n", (unsigned long)t->ucs, t->len);
}

static int
test_decode(struct testcase *t, int high4)
{
    int len, status = 0;
    krb5_ucs4 u = 0;

    len = krb5int_utf8_charlen2(t->p);
    if (len != t->len) {
        printf("expected len=%d, got len=%d\n", t->len, len);
        status = 1;
    }
    if ((t->len == 0 || high4) && krb5int_utf8_to_ucs4(t->p, &u) != -1) {
        printf("unexpected success in utf8_to_ucs4\n");
        status = 1;
    }
    if (krb5int_utf8_to_ucs4(t->p, &u) != 0 && t->ucs != 0 && !high4) {
        printf("unexpected failure in utf8_to_ucs4\n");
        status = 1;
    }
    if (t->ucs != u && !high4) {
        printf("expected 0x%08lx, got 0x%08lx\n", (unsigned long)t->ucs,
               (unsigned long)u);
        status = 1;
    }
    return status;
}

static int
test_encode(struct testcase *t, int high4)
{
    size_t size;
    char buf[7];

    memset(buf, 0, sizeof(buf));
    size = krb5int_ucs4_to_utf8(t->ucs, buf);
    if (high4 && size != 0) {
        printf("unexpected success beyond U+10FFFF\n");
        return 1;
    }
    if (!high4 && size == 0) {
        printf("unexpected zero size on encode\n");
        return 1;
    }
    if (size != 0 && strcmp(t->p, buf) != 0) {
        printf("expected ");
        printhex(t->p);
        printf("got ");
        printhex(buf);
        printf("\n");
        return 1;
    }
    return 0;
}

int
main(int argc, char **argv)
{
    size_t ncases = sizeof(testcases) / sizeof(testcases[0]);
    size_t i;
    struct testcase *t;
    int status = 0, verbose = 0;
    /* Is this a "high" 4-byte encoding above U+10FFFF? */
    int high4;

    if (argc == 2 && strcmp(argv[1], "-v") == 0)
        verbose = 1;
    for (i = 0; i < ncases; i++) {
        t = &testcases[i];
        if (verbose)
            printtest(t);
#ifndef OLDENCODINGS
        high4 = t->ucs > 0x10ffff;
#else
        high4 = 0;
#endif
        if (test_decode(t, high4) != 0)
            status = 1;
        if (t->ucs == 0)
            continue;
        if (test_encode(t, high4) != 0)
            status = 1;
    }
    return status;
}