UFO: Alien Invasion
 All Data Structures Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Modules Pages
utf8.cpp
Go to the documentation of this file.
1 
5 /*
6 All original material Copyright (C) 2002-2020 UFO: Alien Invasion.
7 
8 Copyright (C) 1997-2001 Id Software, Inc.
9 
10 This program is free software; you can redistribute it and/or
11 modify it under the terms of the GNU General Public License
12 as published by the Free Software Foundation; either version 2
13 of the License, or (at your option) any later version.
14 
15 This program is distributed in the hope that it will be useful,
16 but WITHOUT ANY WARRANTY; without even the implied warranty of
17 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
18 
19 See the GNU General Public License for more details.
20 
21 You should have received a copy of the GNU General Public License
22 along with this program; if not, write to the Free Software
23 Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
24 */
25 
26 #include "utf8.h"
27 #include <string.h>
28 
/**
 * @brief Remove one complete (possibly multibyte) character from a string.
 * @param[in,out] s Zero-terminated string that is edited in place.
 * @param[in] pos Character offset (not byte offset) of the character to remove.
 * @return The number of bytes that were removed from the string.
 */
int UTF8_delete_char_at (char* s, int pos)
{
    /* The caller counts in characters; everything below works on bytes. */
    const int byteOffset = UTF8_char_offset_to_byte_offset(s, pos);
    int first = byteOffset;
    int after = byteOffset;

    /* Walk back to the byte that begins the character. */
    for (; first > 0; first--) {
        if (!UTF8_CONTINUATION_BYTE(s[first]))
            break;
    }

    /* Step over the character itself and any continuation bytes after it. */
    if (s[after] != '\0') {
        after++;
        while (s[after] != '\0' && UTF8_CONTINUATION_BYTE(s[after]))
            after++;
    }

    /* Source and destination overlap, so memmove (not memcpy) is required.
     * The "+ 1" carries the string terminator along. */
    memmove(s + first, s + after, strlen(s + after) + 1);

    return after - first;
}
54 
/**
 * @brief Insert one code point, encoded as UTF-8, into a string.
 * @param[in,out] s Zero-terminated string that is edited in place.
 * @param[in] n Capacity of the buffer behind @c s in bytes; the result,
 * including its terminator, must fit into it.
 * @param[in] pos Character offset (not byte offset) to insert at.
 * @param[in] c Code point to insert.
 * @return The number of bytes inserted, or 0 if nothing was inserted because
 * UTF8_encoded_len() reported 0 for @c c or the buffer is too small.
 */
int UTF8_insert_char_at (char* s, int n, int pos, int c)
{
    /* Leading-byte marker bits, indexed by the encoded length. */
    static const int leadMarker[] = { 0x00, 0x00, 0xc0, 0xe0, 0xf0 };

    const int byteOffset = UTF8_char_offset_to_byte_offset(s, pos);
    const int encodedLen = UTF8_encoded_len(c);
    char* const dst = s + byteOffset;
    int restLen;
    int bits;
    int i;

    if (encodedLen == 0)
        return 0;

    /* Bytes from the insertion point to the end, terminator included. */
    restLen = strlen(dst) + 1;
    if (byteOffset + restLen + encodedLen > n)
        return 0;

    /* Open a gap of encodedLen bytes; this also moves the terminator. */
    memmove(dst + encodedLen, dst, restLen);

    /* Fill the continuation bytes back to front, six payload bits each;
     * whatever is left over goes into the leading byte. */
    bits = c;
    for (i = encodedLen - 1; i > 0; i--) {
        dst[i] = 0x80 | (bits & 0x3f);
        bits >>= 6;
    }
    dst[0] = leadMarker[encodedLen] | bits;

    return encodedLen;
}
98 
/**
 * @brief Length in bytes of the UTF-8 sequence announced by a leading byte.
 * @param[in] c The first byte of a sequence.
 * @return 1 to 4 for a byte that can start a sequence, 0 for a continuation
 * byte (10xxxxxx) or for a byte at or above 0xf8.
 */
int UTF8_char_len (unsigned char c)
{
    if ((c & 0x80) == 0x00) /* 0xxxxxxx: plain ASCII */
        return 1;
    if ((c & 0xe0) == 0xc0) /* 110xxxxx */
        return 2;
    if ((c & 0xf0) == 0xe0) /* 1110xxxx */
        return 3;
    if ((c & 0xf8) == 0xf0) /* 11110xxx */
        return 4;

    /* What remains is 10xxxxxx (a continuation byte) or 11111xxx (the lead
     * bytes of the retired 5 and 6 byte forms); neither starts a character. */
    return 0;
}
125 
/**
 * @brief Decode the code point at the front of a string and step past it.
 * @param[in,out] str Pointer to the read position. It is advanced past the
 * decoded sequence on success and left untouched on failure.
 * @return The decoded code point, or -1 at the end of the string or when the
 * bytes at the read position are not an acceptable sequence.
 */
int UTF8_next (const char** str)
{
    const unsigned char* bytes = (const unsigned char*)*str;
    const unsigned char lead = bytes[0];
    size_t seqLen;
    size_t idx;
    int lowest; /* smallest code point that needs seqLen bytes */
    int code;

    if (lead == '\0')
        return -1;

    /* Classify the leading byte: sequence length, payload bits, minimum. */
    if ((lead & 0x80) == 0x00) {
        seqLen = 1;
        lowest = 0;
        code = lead;
    } else if ((lead & 0xE0) == 0xC0) {
        seqLen = 2;
        lowest = 0x80;
        code = lead & 0x1F;
    } else if ((lead & 0xF0) == 0xE0) {
        seqLen = 3;
        lowest = 0x800;
        code = lead & 0x0F;
    } else if ((lead & 0xF8) == 0xF0) {
        seqLen = 4;
        lowest = 0x10000;
        code = lead & 0x07;
    } else {
        /* Continuation byte in leading position, or a byte >= 0xF8. */
        return -1;
    }

    /* Each continuation byte contributes six more bits. A string that ends
     * early fails here, because the terminator is not a continuation byte. */
    idx = 1;
    while (idx < seqLen) {
        if (!UTF8_CONTINUATION_BYTE(bytes[idx]))
            return -1;
        code = (code << 6) | (bytes[idx] & 0x3F);
        idx++;
    }

    /* Reject overlong encodings, the surrogate range and anything beyond
     * the last code point 0x10FFFF. */
    if (code < lowest || (code >= 0xD800 && code <= 0xDFFF) || code >= 0x110000)
        return -1;

    *str += seqLen;
    return code;
}
183 
/**
 * @brief Number of bytes needed to encode a code point as UTF-8.
 * @param[in] c The code point to measure.
 * @return 1 to 4 for values in the range 0..0x10FFFF, or 0 for any value
 * outside that range. Negative values, such as the -1 that UTF8_next()
 * returns on error, also yield 0.
 */
int UTF8_encoded_len (int c)
{
    /* Negative values are not code points. Without this check they would
     * pass the "<= 0x7F" test below and be reported as one-byte characters. */
    if (c < 0)
        return 0;
    if (c <= 0x7F)
        return 1;
    if (c <= 0x07FF)
        return 2;
    if (c <= 0xFFFF)
        return 3;
    if (c <= 0x10FFFF) /* highest defined Unicode code */
        return 4;
    return 0;
}
200 
/**
 * @brief Count the characters (not the bytes) in a zero-terminated string.
 * @param[in] str The string to measure.
 * @return The number of characters. A byte that cannot start a sequence
 * (UTF8_char_len() returns 0 for it) is counted as one character, and a
 * sequence cut short by the terminator is counted as one character.
 */
size_t UTF8_strlen (const char* str)
{
    size_t result = 0;

    while (str[0] != '\0') {
        int n = UTF8_char_len((unsigned char)*str);

        /* UTF8_char_len() returns 0 for bytes that cannot start a sequence.
         * Advancing by 0 would never terminate, so skip such a byte as if
         * it were a one-byte character. */
        if (n < 1)
            n = 1;

        /* Advance byte by byte so that a sequence truncated by the string
         * terminator cannot carry the pointer past the end of the string. */
        while (n > 0 && str[0] != '\0') {
            str++;
            n--;
        }
        result++;
    }
    return result;
}
218 
/**
 * @brief Convert a character offset into a byte offset within a string.
 * @param[in] str Zero-terminated string the offsets refer to.
 * @param[in] pos Character offset from the start of the string.
 * @return The byte offset of character @c pos. If the string has fewer than
 * @c pos characters the offset of the terminator is returned; a @c pos of
 * zero or less yields 0. The result never exceeds the offset of the
 * terminator.
 */
int UTF8_char_offset_to_byte_offset (char* str, int pos)
{
    int result = 0;

    while (pos > 0 && str[0] != '\0') {
        int n = UTF8_char_len((unsigned char)*str);

        /* UTF8_char_len() returns 0 for bytes that cannot start a sequence.
         * Treat such a byte as a one-byte character; otherwise the offset
         * would stall on it while pos keeps counting down. */
        if (n < 1)
            n = 1;

        /* Advance byte by byte so that a sequence truncated by the string
         * terminator cannot produce an offset beyond the end of the string. */
        while (n > 0 && str[0] != '\0') {
            str++;
            result++;
            n--;
        }
        pos--;
    }
    return result;
}
239 
/**
 * @brief String copy that never cuts a multibyte UTF-8 character in half.
 * @param[out] dest Destination buffer.
 * @param[in] src Zero-terminated source string.
 * @param[in] limit Size of @c dest in bytes, terminator included.
 * @return @c dest. The result is always zero-terminated, except when
 * @c limit is 0, in which case @c dest is not written at all.
 */
char* UTF8_strncpyz (char* dest, const char* src, size_t limit)
{
    size_t length;

    /* A zero-sized buffer has no room even for the terminator. Without this
     * check "limit - 1" wraps around to SIZE_MAX, truncation never triggers
     * and the whole source would be written to dest. */
    if (limit == 0)
        return dest;

    length = strlen(src);
    if (length > limit - 1) {
        length = limit - 1;
        /* Only a cut that ends inside a multibyte sequence needs care. */
        if (length > 0 && (unsigned char) src[length - 1] >= 0x80) {
            /* Back up to the byte that starts the last sequence kept. */
            size_t i = length - 1;
            while ((i > 0) && UTF8_CONTINUATION_BYTE((unsigned char) src[i]))
                i--;
            /* If that sequence extends beyond the cut, drop it entirely. */
            if (UTF8_char_len((unsigned char) src[i]) + i > length)
                length = i;
        }
    }

    memcpy(dest, src, length);
    dest[length] = '\0';

    return dest;
}
int UTF8_char_offset_to_byte_offset(char *str, int pos)
Convert UTF-8 character offset to a byte offset in the given string.
Definition: utf8.cpp:227
size_t UTF8_strlen(const char *str)
Count the number of characters (not the number of bytes) in a zero-terminated string.
Definition: utf8.cpp:207
char * UTF8_strncpyz(char *dest, const char *src, size_t limit)
UTF8 capable string copy function.
Definition: utf8.cpp:247
int UTF8_char_len(unsigned char c)
Length of the UTF-8 character that starts with this byte.
Definition: utf8.cpp:109
int UTF8_next(const char **str)
Get the next utf-8 character from the given string.
Definition: utf8.cpp:132
voidpf void * buf
Definition: ioapi.h:42
#define UTF8_CONTINUATION_BYTE(c)
Definition: utf8.h:35
int UTF8_delete_char_at(char *s, int pos)
Delete a whole (possibly multibyte) character from a string.
Definition: utf8.cpp:35
QGL_EXTERN GLuint GLsizei GLsizei * length
Definition: r_gl.h:110
int UTF8_insert_char_at(char *s, int n, int pos, int c)
Insert a (possibly multibyte) UTF-8 character into a string.
Definition: utf8.cpp:63
int UTF8_encoded_len(int c)
Definition: utf8.cpp:188
QGL_EXTERN GLenum GLuint * dest
Definition: r_gl.h:101
QGL_EXTERN GLint i
Definition: r_gl.h:113
QGL_EXTERN GLuint GLchar GLuint * len
Definition: r_gl.h:99