summary | refs | log | tree | commit | diff | homepage
diff options
context:
space:
mode:
author: Christoph Mallon <christoph.mallon@gmx.de> 2012-10-27 20:58:19 +0200
committer: Christoph Mallon <christoph.mallon@gmx.de> 2012-10-30 10:39:50 +0100
commit: d0df86860fdfd7f24a6259690245c5d9f90006c7 (patch)
tree: 564fd7fc50217baf3f3bb300d91944e7855a3e94
parent: 891197cb02f21d5a80de21aa9a94b4557f483f6d (diff)
Implement U, u and u8 strings.
-rw-r--r-- ast2firm.c 19
-rw-r--r-- parser.c 35
-rw-r--r-- preprocessor.c 69
-rw-r--r-- string_rep.c 7
-rw-r--r-- string_rep.h 3
-rw-r--r-- token.c 7
-rw-r--r-- types.c 20
-rw-r--r-- types.h 8
8 files changed, 134 insertions, 34 deletions
diff --git a/ast2firm.c b/ast2firm.c
index 11a4844..31aa71d 100644
--- a/ast2firm.c
+++ b/ast2firm.c
@@ -64,7 +64,6 @@ fp_model_t firm_fp_model = fp_model_precise;
static const backend_params *be_params;
static ir_type *ir_type_char;
-static ir_type *ir_type_wchar_t;
/* architecture specific floating point arithmetic mode (if any) */
static ir_mode *mode_float_arithmetic;
@@ -1134,7 +1133,8 @@ static ir_node *string_to_firm(source_position_t const *const src_pos, char cons
ir_initializer_t *const initializer = create_initializer_compound(slen);
ir_type * elem_type;
switch (value->encoding) {
- case STRING_ENCODING_CHAR: {
+ case STRING_ENCODING_CHAR:
+ case STRING_ENCODING_UTF8: {
elem_type = ir_type_char;
ir_mode *const mode = get_type_mode(elem_type);
@@ -1147,8 +1147,13 @@ static ir_node *string_to_firm(source_position_t const *const src_pos, char cons
goto finish;
}
- case STRING_ENCODING_WIDE: {
- elem_type = ir_type_wchar_t;
+ {
+ type_t *type;
+ case STRING_ENCODING_CHAR16: type = type_char16_t; goto init_wide;
+ case STRING_ENCODING_CHAR32: type = type_char32_t; goto init_wide;
+ case STRING_ENCODING_WIDE: type = type_wchar_t; goto init_wide;
+init_wide:;
+ elem_type = get_ir_type(type);
ir_mode *const mode = get_type_mode(elem_type);
char const *p = value->begin;
@@ -3826,6 +3831,7 @@ static ir_initializer_t *create_ir_initializer_string(initializer_t const *const
char const * p = str->value.begin;
switch (str->value.encoding) {
case STRING_ENCODING_CHAR:
+ case STRING_ENCODING_UTF8:
for (size_t i = 0; i != arr_len; ++i) {
char const c = i < str_len ? *p++ : 0;
ir_tarval *const tv = new_tarval_from_long(c, mode);
@@ -3834,6 +3840,8 @@ static ir_initializer_t *create_ir_initializer_string(initializer_t const *const
}
break;
+ case STRING_ENCODING_CHAR16:
+ case STRING_ENCODING_CHAR32:
case STRING_ENCODING_WIDE:
for (size_t i = 0; i != arr_len; ++i) {
utf32 const c = i < str_len ? read_utf8_char(&p) : 0;
@@ -5348,8 +5356,7 @@ static void init_ir_types(void)
return;
ir_types_initialized = 1;
- ir_type_char = get_ir_type(type_char);
- ir_type_wchar_t = get_ir_type(type_wchar_t);
+ ir_type_char = get_ir_type(type_char);
be_params = be_get_backend_param();
mode_float_arithmetic = be_params->mode_float_arithmetic;
diff --git a/parser.c b/parser.c
index 869cc25..8b0aadb 100644
--- a/parser.c
+++ b/parser.c
@@ -1060,8 +1060,13 @@ static string_t concat_string_literals(void)
warningf(WARN_TRADITIONAL, HERE, "traditional C rejects string constant concatenation");
string_encoding_t enc = token.literal.string.encoding;
do {
- if (token.literal.string.encoding != STRING_ENCODING_CHAR) {
- enc = token.literal.string.encoding;
+ string_encoding_t const new_enc = token.literal.string.encoding;
+ if (new_enc != enc && new_enc != STRING_ENCODING_CHAR) {
+ if (enc == STRING_ENCODING_CHAR) {
+ enc = new_enc;
+ } else {
+ errorf(HERE, "concatenating string literals with encodings %s and %s", get_string_encoding_prefix(enc), get_string_encoding_prefix(new_enc));
+ }
}
append_string(&token.literal.string);
eat(T_STRING_LITERAL);
@@ -1084,7 +1089,7 @@ static string_t parse_string_literals(char const *const context)
string_t const res = concat_string_literals();
if (res.encoding != STRING_ENCODING_CHAR) {
- errorf(&pos, "expected plain string literal, got wide string literal");
+ errorf(&pos, "expected plain string literal, got %s string literal", get_string_encoding_prefix(res.encoding));
}
return res;
@@ -1565,7 +1570,8 @@ static initializer_t *initializer_from_expression(type_t *orig_type,
array_type_t *const array_type = &type->array;
type_t *const element_type = skip_typeref(array_type->element_type);
switch (expression->string_literal.value.encoding) {
- case STRING_ENCODING_CHAR: {
+ case STRING_ENCODING_CHAR:
+ case STRING_ENCODING_UTF8: {
if (is_type_atomic(element_type, ATOMIC_TYPE_CHAR) ||
is_type_atomic(element_type, ATOMIC_TYPE_SCHAR) ||
is_type_atomic(element_type, ATOMIC_TYPE_UCHAR)) {
@@ -1574,9 +1580,12 @@ static initializer_t *initializer_from_expression(type_t *orig_type,
break;
}
+ case STRING_ENCODING_CHAR16:
+ case STRING_ENCODING_CHAR32:
case STRING_ENCODING_WIDE: {
- type_t *bare_wchar_type = skip_typeref(type_wchar_t);
- if (get_unqualified_type(element_type) == bare_wchar_type) {
+ assert(is_type_pointer(expression->base.type));
+ type_t *const init_type = get_unqualified_type(expression->base.type->pointer.points_to);
+ if (types_compatible(get_unqualified_type(element_type), init_type)) {
make_string_init:;
initializer_t *const init = allocate_initializer_zero(INITIALIZER_STRING);
init->value.value = expression;
@@ -5633,8 +5642,11 @@ static type_t *get_string_type(string_encoding_t const enc)
{
bool const warn = is_warn_on(WARN_WRITE_STRINGS);
switch (enc) {
- case STRING_ENCODING_CHAR: return warn ? type_const_char_ptr : type_char_ptr;
- case STRING_ENCODING_WIDE: return warn ? type_const_wchar_t_ptr : type_wchar_t_ptr;
+ case STRING_ENCODING_CHAR:
+ case STRING_ENCODING_UTF8: return warn ? type_const_char_ptr : type_char_ptr;
+ case STRING_ENCODING_CHAR16: return warn ? type_char16_t_const_ptr : type_char16_t_ptr;
+ case STRING_ENCODING_CHAR32: return warn ? type_char32_t_const_ptr : type_char32_t_ptr;
+ case STRING_ENCODING_WIDE: return warn ? type_const_wchar_t_ptr : type_wchar_t_ptr;
}
panic("invalid string encoding");
}
@@ -5875,6 +5887,7 @@ static expression_t *parse_character_constant(void)
size_t const size = get_string_len(&token.literal.string);
switch (token.literal.string.encoding) {
case STRING_ENCODING_CHAR:
+ case STRING_ENCODING_UTF8:
literal->base.type = c_mode & _CXX ? type_char : type_int;
if (size > 1) {
if (!GNU_MODE && !(c_mode & _C99)) {
@@ -5886,8 +5899,10 @@ static expression_t *parse_character_constant(void)
}
break;
- case STRING_ENCODING_WIDE:
- literal->base.type = type_int;
+ case STRING_ENCODING_CHAR16: literal->base.type = type_char16_t; goto warn_multi;
+ case STRING_ENCODING_CHAR32: literal->base.type = type_char32_t; goto warn_multi;
+ case STRING_ENCODING_WIDE: literal->base.type = type_wchar_t; goto warn_multi;
+warn_multi:
if (size > 1) {
warningf(WARN_MULTICHAR, HERE, "multi-character character constant");
}
diff --git a/preprocessor.c b/preprocessor.c
index 6e3daf3..55c1e3e 100644
--- a/preprocessor.c
+++ b/preprocessor.c
@@ -136,6 +136,11 @@ static symbol_t *symbol_percentcolon;
static symbol_t *symbol_percentcolonpercentcolon;
static symbol_t *symbol_percentgreater;
+static symbol_t *symbol_L;
+static symbol_t *symbol_U;
+static symbol_t *symbol_u;
+static symbol_t *symbol_u8;
+
static void init_symbols(void)
{
symbol_colongreater = symbol_table_insert(":>");
@@ -144,6 +149,11 @@ static void init_symbols(void)
symbol_percentcolon = symbol_table_insert("%:");
symbol_percentcolonpercentcolon = symbol_table_insert("%:%:");
symbol_percentgreater = symbol_table_insert("%>");
+
+ symbol_L = symbol_table_insert("L");
+ symbol_U = symbol_table_insert("U");
+ symbol_u = symbol_table_insert("u");
+ symbol_u8 = symbol_table_insert("u8");
}
void switch_pp_input(FILE *const file, char const *const filename, searchpath_entry_t *const path, bool const is_system_header)
@@ -684,6 +694,18 @@ string_t make_string(char const *const string)
return sym_make_string(STRING_ENCODING_CHAR);
}
+static utf32 get_string_encoding_limit(string_encoding_t const enc)
+{
+ switch (enc) {
+ case STRING_ENCODING_CHAR: return 0xFF;
+ case STRING_ENCODING_CHAR16: return 0xFFFF;
+ case STRING_ENCODING_CHAR32: return 0xFFFFFFFF;
+ case STRING_ENCODING_UTF8: return 0xFFFFFFFF;
+ case STRING_ENCODING_WIDE: return 0xFFFFFFFF; // FIXME depends on settings
+ }
+ panic("invalid string encoding");
+}
+
static void parse_string(utf32 const delimiter, token_kind_t const kind,
string_encoding_t const enc,
char const *const context)
@@ -692,15 +714,16 @@ static void parse_string(utf32 const delimiter, token_kind_t const kind,
eat(delimiter);
+ utf32 const limit = get_string_encoding_limit(enc);
while (true) {
switch (input.c) {
case '\\': {
if (resolve_escape_sequences) {
utf32 const tc = parse_escape_sequence();
+ if (tc > limit) {
+ warningf(WARN_OTHER, &pp_token.base.source_position, "escape sequence out of range");
+ }
if (enc == STRING_ENCODING_CHAR) {
- if (tc >= 0x100) {
- warningf(WARN_OTHER, &pp_token.base.source_position, "escape sequence out of range");
- }
obstack_1grow(&symbol_obstack, tc);
} else {
obstack_grow_utf8(&symbol_obstack, tc);
@@ -1133,6 +1156,17 @@ static inline void eat_token(token_kind_t const kind)
next_input_token();
}
+static string_encoding_t identify_encoding_prefix(symbol_t *const sym)
+{
+ if (sym == symbol_L) return STRING_ENCODING_WIDE;
+ if (c_mode & _C11) {
+ if (sym == symbol_U) return STRING_ENCODING_CHAR32;
+ if (sym == symbol_u) return STRING_ENCODING_CHAR16;
+ if (sym == symbol_u8) return STRING_ENCODING_UTF8;
+ }
+ return STRING_ENCODING_CHAR;
+}
+
static void parse_symbol(void)
{
assert(obstack_object_size(&symbol_obstack) == 0);
@@ -1190,19 +1224,26 @@ end_symbol:
obstack_1grow(&symbol_obstack, '\0');
char *string = obstack_finish(&symbol_obstack);
- /* might be a wide string or character constant ( L"string"/L'c' ) */
- if (input.c == '"' && string[0] == 'L' && string[1] == '\0') {
- obstack_free(&symbol_obstack, string);
- parse_string_literal(STRING_ENCODING_WIDE);
- return;
- } else if (input.c == '\'' && string[0] == 'L' && string[1] == '\0') {
- obstack_free(&symbol_obstack, string);
- parse_character_constant(STRING_ENCODING_WIDE);
- return;
- }
-
symbol_t *symbol = symbol_table_insert(string);
+ /* Might be a prefixed string or character constant: L/U/u/u8"string". */
+ if (input.c == '"') {
+ string_encoding_t const enc = identify_encoding_prefix(symbol);
+ if (enc != STRING_ENCODING_CHAR) {
+ parse_string_literal(enc);
+ return;
+ }
+ } else if (input.c == '\'') {
+ string_encoding_t const enc = identify_encoding_prefix(symbol);
+ if (enc != STRING_ENCODING_CHAR) {
+ if (enc == STRING_ENCODING_UTF8) {
+ errorf(&pp_token.base.source_position, "'u8' is not a valid encoding for a chracter constant");
+ }
+ parse_character_constant(enc);
+ return;
+ }
+ }
+
pp_token.kind = symbol->ID;
pp_token.base.symbol = symbol;
diff --git a/string_rep.c b/string_rep.c
index ff58aad..28b0746 100644
--- a/string_rep.c
+++ b/string_rep.c
@@ -16,8 +16,11 @@ static inline size_t wstrlen(const string_t *string)
size_t get_string_len(string_t const *const str)
{
switch (str->encoding) {
- case STRING_ENCODING_CHAR: return str->size;
- case STRING_ENCODING_WIDE: return wstrlen(str);
+ case STRING_ENCODING_CHAR:
+ case STRING_ENCODING_UTF8: return str->size;
+ case STRING_ENCODING_CHAR16:
+ case STRING_ENCODING_CHAR32:
+ case STRING_ENCODING_WIDE: return wstrlen(str);
}
panic("invalid string encoding");
}
diff --git a/string_rep.h b/string_rep.h
index c0868eb..ce4ca3e 100644
--- a/string_rep.h
+++ b/string_rep.h
@@ -25,6 +25,9 @@
enum string_encoding_t {
STRING_ENCODING_CHAR,
+ STRING_ENCODING_CHAR16,
+ STRING_ENCODING_CHAR32,
+ STRING_ENCODING_UTF8,
STRING_ENCODING_WIDE
};
typedef enum string_encoding_t string_encoding_t;
diff --git a/token.c b/token.c
index 7d2104c..50de656 100644
--- a/token.c
+++ b/token.c
@@ -95,8 +95,11 @@ void print_token_kind(FILE *f, token_kind_t token_kind)
char const *get_string_encoding_prefix(string_encoding_t const enc)
{
switch (enc) {
- case STRING_ENCODING_CHAR: return "";
- case STRING_ENCODING_WIDE: return "L";
+ case STRING_ENCODING_CHAR: return "";
+ case STRING_ENCODING_CHAR16: return "u";
+ case STRING_ENCODING_CHAR32: return "U";
+ case STRING_ENCODING_UTF8: return "u8";
+ case STRING_ENCODING_WIDE: return "L";
}
panic("invalid string encoding");
}
diff --git a/types.c b/types.c
index 24d38d3..d6ff22e 100644
--- a/types.c
+++ b/types.c
@@ -61,6 +61,10 @@ type_t *type_const_void_ptr_restrict;
type_t *type_char_ptr_ptr;
+type_t *type_char16_t;
+type_t *type_char32_t;
+type_t *type_char16_t_const;
+type_t *type_char32_t_const;
type_t *type_intmax_t;
type_t *type_ptrdiff_t;
type_t *type_size_t;
@@ -73,6 +77,10 @@ type_t *type_wint_t;
type_t *type_int32_t;
type_t *type_int64_t;
+type_t *type_char16_t_ptr;
+type_t *type_char32_t_ptr;
+type_t *type_char16_t_const_ptr;
+type_t *type_char32_t_const_ptr;
type_t *type_intmax_t_ptr;
type_t *type_ptrdiff_t_ptr;
type_t *type_ssize_t_ptr;
@@ -198,4 +206,16 @@ void init_wchar_types(atomic_type_kind_t akind)
type_wchar_t_ptr = make_pointer_type(type_wchar_t, TYPE_QUALIFIER_NONE);
type_const_wchar_t_ptr
= make_pointer_type(type_const_wchar_t, TYPE_QUALIFIER_NONE);
+
+ atomic_type_kind_t const u2 = find_unsigned_int_atomic_type_kind_for_size(2);
+ type_char16_t = make_atomic_type(u2, TYPE_QUALIFIER_NONE);
+ type_char16_t_const = make_atomic_type(u2, TYPE_QUALIFIER_CONST);
+ type_char16_t_ptr = make_pointer_type(type_char16_t, TYPE_QUALIFIER_NONE);
+ type_char16_t_const_ptr = make_pointer_type(type_char16_t_const, TYPE_QUALIFIER_NONE);
+
+ atomic_type_kind_t const u4 = find_unsigned_int_atomic_type_kind_for_size(4);
+ type_char32_t = make_atomic_type(u4, TYPE_QUALIFIER_NONE);
+ type_char32_t_const = make_atomic_type(u4, TYPE_QUALIFIER_CONST);
+ type_char32_t_ptr = make_pointer_type(type_char32_t, TYPE_QUALIFIER_NONE);
+ type_char32_t_const_ptr = make_pointer_type(type_char32_t_const, TYPE_QUALIFIER_NONE);
}
diff --git a/types.h b/types.h
index b173a00..e2cc949 100644
--- a/types.h
+++ b/types.h
@@ -60,6 +60,10 @@ extern type_t *type_const_void_ptr_restrict;
extern type_t *type_char_ptr_ptr;
+extern type_t *type_char16_t;
+extern type_t *type_char32_t;
+extern type_t *type_char16_t_const;
+extern type_t *type_char32_t_const;
extern type_t *type_intmax_t;
extern type_t *type_ptrdiff_t;
extern type_t *type_size_t;
@@ -73,6 +77,10 @@ extern type_t *type_wint_t;
extern type_t *type_int32_t;
extern type_t *type_int64_t;
+extern type_t *type_char16_t_ptr;
+extern type_t *type_char32_t_ptr;
+extern type_t *type_char16_t_const_ptr;
+extern type_t *type_char32_t_const_ptr;
extern type_t *type_intmax_t_ptr;
extern type_t *type_ptrdiff_t_ptr;
extern type_t *type_ssize_t_ptr;