From patchwork Tue Nov 22 19:41:15 2022 Content-Type: text/plain; charset="utf-8" MIME-Version: 1.0 Content-Transfer-Encoding: 8bit X-Patchwork-Submitter: Stefan Schantl X-Patchwork-Id: 6158 Return-Path: Received: from mail01.ipfire.org (mail01.haj.ipfire.org [172.28.1.202]) (using TLSv1.3 with cipher TLS_AES_256_GCM_SHA384 (256/256 bits) key-exchange X25519 server-signature ECDSA (P-384) server-digest SHA384 client-signature ECDSA (P-384) client-digest SHA384) (Client CN "mail01.haj.ipfire.org", Issuer "R3" (verified OK)) by web04.haj.ipfire.org (Postfix) with ESMTPS id 4NGvlH2rSZz3wgq for ; Tue, 22 Nov 2022 19:41:31 +0000 (UTC) Received: from mail02.haj.ipfire.org (mail02.haj.ipfire.org [172.28.1.201]) (using TLSv1.3 with cipher TLS_AES_256_GCM_SHA384 (256/256 bits) key-exchange X25519 server-signature ECDSA (P-384) client-signature ECDSA (P-384)) (Client CN "mail02.haj.ipfire.org", Issuer "R3" (verified OK)) by mail01.ipfire.org (Postfix) with ESMTPS id 4NGvlF5fFDz2mv; Tue, 22 Nov 2022 19:41:29 +0000 (UTC) Received: from mail02.haj.ipfire.org (localhost [127.0.0.1]) by mail02.haj.ipfire.org (Postfix) with ESMTP id 4NGvlF4jXlz2xlg; Tue, 22 Nov 2022 19:41:29 +0000 (UTC) Received: from mail01.ipfire.org (mail01.haj.ipfire.org [172.28.1.202]) (using TLSv1.3 with cipher TLS_AES_256_GCM_SHA384 (256/256 bits) key-exchange X25519 server-signature ECDSA (P-384) server-digest SHA384 client-signature ECDSA (P-384) client-digest SHA384) (Client CN "mail01.haj.ipfire.org", Issuer "R3" (verified OK)) by mail02.haj.ipfire.org (Postfix) with ESMTPS id 4NGvlF0QVqz2xK9 for ; Tue, 22 Nov 2022 19:41:29 +0000 (UTC) Received: from [127.0.0.1] (localhost [127.0.0.1]) (using TLSv1.3 with cipher TLS_AES_256_GCM_SHA384 (256/256 bits) key-exchange X25519 server-signature ECDSA (P-384) server-digest SHA384) (No client certificate requested) by mail01.ipfire.org (Postfix) with ESMTPSA id 4NGvl91fkjz1GZ; Tue, 22 Nov 2022 19:41:25 +0000 (UTC) DKIM-Signature: v=1; a=ed25519-sha256; 
c=relaxed/relaxed; d=ipfire.org; s=202003ed25519; t=1669146088; h=from:from:reply-to:subject:subject:date:date:message-id:message-id: to:to:cc:cc:mime-version:mime-version:content-type:content-type: content-transfer-encoding:content-transfer-encoding; bh=RXCYiPdClwKTk8SWbudonzpZwxUxUYRVK7AjiK3CP8Y=; b=i5Ma4XurggFAoOfHRznrF2bdD20wEFSvmxxQOBiSd8LHbXwSys1gukrBt5m+Pjs/G+82vw Ui1Z+AtXSL+vkTCw== DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/relaxed; d=ipfire.org; s=202003rsa; t=1669146088; h=from:from:reply-to:subject:subject:date:date:message-id:message-id: to:to:cc:cc:mime-version:mime-version:content-type:content-type: content-transfer-encoding:content-transfer-encoding; bh=RXCYiPdClwKTk8SWbudonzpZwxUxUYRVK7AjiK3CP8Y=; b=JLHLiAuJaa0UPz+HHg2dVFSNxvKT37K7UsCOgrsKvCaYU6ig6GqVQxQo/8+RO41qSgZEJ/ pfE+faRtIBXT3Hkzq2RK4YKYko7CwKSr1po2NkQ8lTwkepnDI48Ggsl1kQKIXbHHxHKFvF YzlSxsAVo7TkztSy4dKoOEMtEbT3EI4G8twNUbydl9Ck8aqICok//gFbbWTWA5sd3694yb 65BB8OUyzziXLbR1fHc7OgHkap6kbbpvqapOL6zMEHrT0DteRIfzp2PalJ/l0fdWvgqr00 vQkPgI/sMWLzVe3z78VsmUlRAMtW/RIkr/zU30wg2ypstHTN6MHBrd4MgkNHvg== From: Stefan Schantl To: development@lists.ipfire.org Subject: [PATCH] coreutils: Update to 9.1 Date: Tue, 22 Nov 2022 20:41:15 +0100 Message-Id: <20221122194115.320535-1-stefan.schantl@ipfire.org> MIME-Version: 1.0 X-BeenThere: development@lists.ipfire.org X-Mailman-Version: 2.1.29 Precedence: list List-Id: IPFire development talk List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , Errors-To: development-bounces@lists.ipfire.org Sender: "Development" All deleted i18n subpatches are now part of the i18n main patch and therefore no longer required. 
Signed-off-by: Stefan Schantl --- coreutils/coreutils.nm | 2 +- .../patches/coreutils-i18n-cut-old.patch | 565 ---- .../coreutils-i18n-expand-unexpand.patch | 848 ------ .../patches/coreutils-i18n-fix-unexpand.patch | 28 - .../coreutils-i18n-fix2-expand-unexpand.patch | 108 - .../patches/coreutils-i18n-sort-human.patch | 35 - .../coreutils-i18n-un-expand-BOM.patch | 456 --- coreutils/patches/coreutils-i18n.patch | 2546 ++++++++++++++--- 8 files changed, 2160 insertions(+), 2428 deletions(-) delete mode 100644 coreutils/patches/coreutils-i18n-cut-old.patch delete mode 100644 coreutils/patches/coreutils-i18n-expand-unexpand.patch delete mode 100644 coreutils/patches/coreutils-i18n-fix-unexpand.patch delete mode 100644 coreutils/patches/coreutils-i18n-fix2-expand-unexpand.patch delete mode 100644 coreutils/patches/coreutils-i18n-sort-human.patch delete mode 100644 coreutils/patches/coreutils-i18n-un-expand-BOM.patch diff --git a/coreutils/coreutils.nm b/coreutils/coreutils.nm index 026bc14da..888c6afe2 100644 --- a/coreutils/coreutils.nm +++ b/coreutils/coreutils.nm @@ -4,7 +4,7 @@ ############################################################################### name = coreutils -version = 8.31 +version = 9.1 release = 1 groups = System/Base diff --git a/coreutils/patches/coreutils-i18n-cut-old.patch b/coreutils/patches/coreutils-i18n-cut-old.patch deleted file mode 100644 index 757ee0fbb..000000000 --- a/coreutils/patches/coreutils-i18n-cut-old.patch +++ /dev/null @@ -1,565 +0,0 @@ -diff --git a/src/cut.c b/src/cut.c -index 7ab6be4..022d0ad 100644 ---- a/src/cut.c -+++ b/src/cut.c -@@ -28,6 +28,11 @@ - #include - #include - #include -+ -+/* Get mbstate_t, mbrtowc(). */ -+#if HAVE_WCHAR_H -+# include -+#endif - #include "system.h" - - #include "error.h" -@@ -38,6 +43,18 @@ - - #include "set-fields.h" - -+/* MB_LEN_MAX is incorrectly defined to be 1 in at least one GCC -+ installation; work around this configuration error. 
*/ -+#if !defined MB_LEN_MAX || MB_LEN_MAX < 2 -+# undef MB_LEN_MAX -+# define MB_LEN_MAX 16 -+#endif -+ -+/* Some systems, like BeOS, have multibyte encodings but lack mbstate_t. */ -+#if HAVE_MBRTOWC && defined mbstate_t -+# define mbrtowc(pwc, s, n, ps) (mbrtowc) (pwc, s, n, 0) -+#endif -+ - /* The official name of this program (e.g., no 'g' prefix). */ - #define PROGRAM_NAME "cut" - -@@ -54,6 +71,52 @@ - } \ - while (0) - -+/* Refill the buffer BUF to get a multibyte character. */ -+#define REFILL_BUFFER(BUF, BUFPOS, BUFLEN, STREAM) \ -+ do \ -+ { \ -+ if (BUFLEN < MB_LEN_MAX && !feof (STREAM) && !ferror (STREAM)) \ -+ { \ -+ memmove (BUF, BUFPOS, BUFLEN); \ -+ BUFLEN += fread (BUF + BUFLEN, sizeof(char), BUFSIZ, STREAM); \ -+ BUFPOS = BUF; \ -+ } \ -+ } \ -+ while (0) -+ -+/* Get wide character on BUFPOS. BUFPOS is not included after that. -+ If byte sequence is not valid as a character, CONVFAIL is true. Otherwise false. */ -+#define GET_NEXT_WC_FROM_BUFFER(WC, BUFPOS, BUFLEN, MBLENGTH, STATE, CONVFAIL) \ -+ do \ -+ { \ -+ mbstate_t state_bak; \ -+ \ -+ if (BUFLEN < 1) \ -+ { \ -+ WC = WEOF; \ -+ break; \ -+ } \ -+ \ -+ /* Get a wide character. */ \ -+ CONVFAIL = false; \ -+ state_bak = STATE; \ -+ MBLENGTH = mbrtowc ((wchar_t *)&WC, BUFPOS, BUFLEN, &STATE); \ -+ \ -+ switch (MBLENGTH) \ -+ { \ -+ case (size_t)-1: \ -+ case (size_t)-2: \ -+ CONVFAIL = true; \ -+ STATE = state_bak; \ -+ /* Fall througn. */ \ -+ \ -+ case 0: \ -+ MBLENGTH = 1; \ -+ break; \ -+ } \ -+ } \ -+ while (0) -+ - - /* Pointer inside RP. When checking if a byte or field is selected - by a finite range, we check if it is between CURRENT_RP.LO -@@ -61,6 +124,9 @@ - CURRENT_RP.HI then we make CURRENT_RP to point to the next range pair. */ - static struct field_range_pair *current_rp; - -+/* Length of the delimiter given as argument to -d. 
*/ -+size_t delimlen; -+ - /* This buffer is used to support the semantics of the -s option - (or lack of same) when the specified field list includes (does - not include) the first field. In both of those cases, the entire -@@ -77,15 +143,25 @@ enum operating_mode - { - undefined_mode, - -- /* Output characters that are in the given bytes. */ -+ /* Output bytes that are at the given positions. */ - byte_mode, - -+ /* Output characters that are at the given positions. */ -+ character_mode, -+ - /* Output the given delimiter-separated fields. */ - field_mode - }; - - static enum operating_mode operating_mode; - -+/* If nonzero, when in byte mode, don't split multibyte characters. */ -+static int byte_mode_character_aware; -+ -+/* If nonzero, the function for single byte locale is work -+ if this program runs on multibyte locale. */ -+static int force_singlebyte_mode; -+ - /* If true do not output lines containing no delimiter characters. - Otherwise, all such lines are printed. This option is valid only - with field mode. */ -@@ -97,6 +173,9 @@ static bool complement; - - /* The delimiter character for field mode. */ - static unsigned char delim; -+#if HAVE_WCHAR_H -+static wchar_t wcdelim; -+#endif - - /* The delimiter for each line/record. */ - static unsigned char line_delim = '\n'; -@@ -164,7 +243,7 @@ Print selected parts of lines from each FILE to standard output.\n\ - -f, --fields=LIST select only these fields; also print any line\n\ - that contains no delimiter character, unless\n\ - the -s option is specified\n\ -- -n (ignored)\n\ -+ -n with -b: don't split multibyte characters\n\ - "), stdout); - fputs (_("\ - --complement complement the set of selected bytes, characters\n\ -@@ -280,6 +359,82 @@ cut_bytes (FILE *stream) - } - } - -+#if HAVE_MBRTOWC -+/* This function is in use for the following case. -+ -+ 1. Read from the stream STREAM, printing to standard output any selected -+ characters. -+ -+ 2. 
Read from stream STREAM, printing to standard output any selected bytes, -+ without splitting multibyte characters. */ -+ -+static void -+cut_characters_or_cut_bytes_no_split (FILE *stream) -+{ -+ uintmax_t idx; /* number of bytes or characters in the line so far. */ -+ char buf[MB_LEN_MAX + BUFSIZ]; /* For spooling a read byte sequence. */ -+ char *bufpos; /* Next read position of BUF. */ -+ size_t buflen; /* The length of the byte sequence in buf. */ -+ wint_t wc; /* A gotten wide character. */ -+ size_t mblength; /* The byte size of a multibyte character which shows -+ as same character as WC. */ -+ mbstate_t state; /* State of the stream. */ -+ bool convfail = false; /* true, when conversion failed. Otherwise false. */ -+ /* Whether to begin printing delimiters between ranges for the current line. -+ Set after we've begun printing data corresponding to the first range. */ -+ bool print_delimiter = false; -+ -+ idx = 0; -+ buflen = 0; -+ bufpos = buf; -+ memset (&state, '\0', sizeof(mbstate_t)); -+ -+ current_rp = frp; -+ -+ while (1) -+ { -+ REFILL_BUFFER (buf, bufpos, buflen, stream); -+ -+ GET_NEXT_WC_FROM_BUFFER (wc, bufpos, buflen, mblength, state, convfail); -+ (void) convfail; /* ignore unused */ -+ -+ if (wc == WEOF) -+ { -+ if (idx > 0) -+ putchar (line_delim); -+ break; -+ } -+ else if (wc == line_delim) -+ { -+ putchar (line_delim); -+ idx = 0; -+ print_delimiter = false; -+ current_rp = frp; -+ } -+ else -+ { -+ next_item (&idx); -+ if (print_kth (idx)) -+ { -+ if (output_delimiter_specified) -+ { -+ if (print_delimiter && is_range_start_index (idx)) -+ { -+ fwrite (output_delimiter_string, sizeof (char), -+ output_delimiter_length, stdout); -+ } -+ print_delimiter = true; -+ } -+ fwrite (bufpos, mblength, sizeof(char), stdout); -+ } -+ } -+ -+ buflen -= mblength; -+ bufpos += mblength; -+ } -+} -+#endif -+ - /* Read from stream STREAM, printing to standard output any selected fields. 
*/ - - static void -@@ -425,13 +580,211 @@ cut_fields (FILE *stream) - } - } - -+#if HAVE_MBRTOWC -+static void -+cut_fields_mb (FILE *stream) -+{ -+ int c; -+ uintmax_t field_idx; -+ int found_any_selected_field; -+ int buffer_first_field; -+ int empty_input; -+ char buf[MB_LEN_MAX + BUFSIZ]; /* For spooling a read byte sequence. */ -+ char *bufpos; /* Next read position of BUF. */ -+ size_t buflen; /* The length of the byte sequence in buf. */ -+ wint_t wc = 0; /* A gotten wide character. */ -+ size_t mblength; /* The byte size of a multibyte character which shows -+ as same character as WC. */ -+ mbstate_t state; /* State of the stream. */ -+ bool convfail = false; /* true, when conversion failed. Otherwise false. */ -+ -+ current_rp = frp; -+ -+ found_any_selected_field = 0; -+ field_idx = 1; -+ bufpos = buf; -+ buflen = 0; -+ memset (&state, '\0', sizeof(mbstate_t)); -+ -+ c = getc (stream); -+ empty_input = (c == EOF); -+ if (c != EOF) -+ { -+ ungetc (c, stream); -+ wc = 0; -+ } -+ else -+ wc = WEOF; -+ -+ /* To support the semantics of the -s flag, we may have to buffer -+ all of the first field to determine whether it is `delimited.' -+ But that is unnecessary if all non-delimited lines must be printed -+ and the first field has been selected, or if non-delimited lines -+ must be suppressed and the first field has *not* been selected. -+ That is because a non-delimited line has exactly one field. 
*/ -+ buffer_first_field = (suppress_non_delimited ^ !print_kth (1)); -+ -+ while (1) -+ { -+ if (field_idx == 1 && buffer_first_field) -+ { -+ int len = 0; -+ -+ while (1) -+ { -+ REFILL_BUFFER (buf, bufpos, buflen, stream); -+ -+ GET_NEXT_WC_FROM_BUFFER -+ (wc, bufpos, buflen, mblength, state, convfail); -+ -+ if (wc == WEOF) -+ break; -+ -+ field_1_buffer = xrealloc (field_1_buffer, len + mblength); -+ memcpy (field_1_buffer + len, bufpos, mblength); -+ len += mblength; -+ buflen -= mblength; -+ bufpos += mblength; -+ -+ if (!convfail && (wc == line_delim || wc == wcdelim)) -+ break; -+ } -+ -+ if (len <= 0 && wc == WEOF) -+ break; -+ -+ /* If the first field extends to the end of line (it is not -+ delimited) and we are printing all non-delimited lines, -+ print this one. */ -+ if (convfail || (!convfail && wc != wcdelim)) -+ { -+ if (suppress_non_delimited) -+ { -+ /* Empty. */ -+ } -+ else -+ { -+ fwrite (field_1_buffer, sizeof (char), len, stdout); -+ /* Make sure the output line is newline terminated. */ -+ if (convfail || (!convfail && wc != line_delim)) -+ putchar (line_delim); -+ } -+ continue; -+ } -+ -+ if (print_kth (1)) -+ { -+ /* Print the field, but not the trailing delimiter. 
*/ -+ fwrite (field_1_buffer, sizeof (char), len - 1, stdout); -+ found_any_selected_field = 1; -+ } -+ next_item (&field_idx); -+ } -+ -+ if (wc != WEOF) -+ { -+ if (print_kth (field_idx)) -+ { -+ if (found_any_selected_field) -+ { -+ fwrite (output_delimiter_string, sizeof (char), -+ output_delimiter_length, stdout); -+ } -+ found_any_selected_field = 1; -+ } -+ -+ while (1) -+ { -+ REFILL_BUFFER (buf, bufpos, buflen, stream); -+ -+ GET_NEXT_WC_FROM_BUFFER -+ (wc, bufpos, buflen, mblength, state, convfail); -+ -+ if (wc == WEOF) -+ break; -+ else if (!convfail && (wc == wcdelim || wc == line_delim)) -+ { -+ buflen -= mblength; -+ bufpos += mblength; -+ break; -+ } -+ -+ if (print_kth (field_idx)) -+ fwrite (bufpos, mblength, sizeof(char), stdout); -+ -+ buflen -= mblength; -+ bufpos += mblength; -+ } -+ } -+ -+ if ((!convfail || wc == line_delim) && buflen < 1) -+ wc = WEOF; -+ -+ if (!convfail && wc == wcdelim) -+ next_item (&field_idx); -+ else if (wc == WEOF || (!convfail && wc == line_delim)) -+ { -+ if (found_any_selected_field -+ || (!empty_input && !(suppress_non_delimited && field_idx == 1))) -+ putchar (line_delim); -+ if (wc == WEOF) -+ break; -+ field_idx = 1; -+ current_rp = frp; -+ found_any_selected_field = 0; -+ } -+ } -+} -+#endif -+ - static void - cut_stream (FILE *stream) - { -- if (operating_mode == byte_mode) -- cut_bytes (stream); -+#if HAVE_MBRTOWC -+ if (MB_CUR_MAX > 1 && !force_singlebyte_mode) -+ { -+ switch (operating_mode) -+ { -+ case byte_mode: -+ if (byte_mode_character_aware) -+ cut_characters_or_cut_bytes_no_split (stream); -+ else -+ cut_bytes (stream); -+ break; -+ -+ case character_mode: -+ cut_characters_or_cut_bytes_no_split (stream); -+ break; -+ -+ case field_mode: -+ if (delimlen == 1) -+ { -+ /* Check if we have utf8 multibyte locale, so we can use this -+ optimization because of uniqueness of characters, which is -+ not true for e.g. 
SJIS */ -+ char * loc = setlocale(LC_CTYPE, NULL); -+ if (loc && (strstr (loc, "UTF-8") || strstr (loc, "utf-8") || -+ strstr (loc, "UTF8") || strstr (loc, "utf8"))) -+ { -+ cut_fields (stream); -+ break; -+ } -+ } -+ cut_fields_mb (stream); -+ break; -+ -+ default: -+ abort (); -+ } -+ } - else -- cut_fields (stream); -+#endif -+ { -+ if (operating_mode == field_mode) -+ cut_fields (stream); -+ else -+ cut_bytes (stream); -+ } - } - - /* Process file FILE to standard output. -@@ -483,6 +836,7 @@ main (int argc, char **argv) - bool ok; - bool delim_specified = false; - char *spec_list_string IF_LINT ( = NULL); -+ char mbdelim[MB_LEN_MAX + 1]; - - initialize_main (&argc, &argv); - set_program_name (argv[0]); -@@ -505,7 +859,6 @@ main (int argc, char **argv) - switch (optc) - { - case 'b': -- case 'c': - /* Build the byte list. */ - if (operating_mode != undefined_mode) - FATAL_ERROR (_("only one type of list may be specified")); -@@ -513,6 +866,14 @@ main (int argc, char **argv) - spec_list_string = optarg; - break; - -+ case 'c': -+ /* Build the character list. */ -+ if (operating_mode != undefined_mode) -+ FATAL_ERROR (_("only one type of list may be specified")); -+ operating_mode = character_mode; -+ spec_list_string = optarg; -+ break; -+ - case 'f': - /* Build the field list. */ - if (operating_mode != undefined_mode) -@@ -524,10 +885,38 @@ main (int argc, char **argv) - case 'd': - /* New delimiter. */ - /* Interpret -d '' to mean 'use the NUL byte as the delimiter.' */ -- if (optarg[0] != '\0' && optarg[1] != '\0') -- FATAL_ERROR (_("the delimiter must be a single character")); -- delim = optarg[0]; -- delim_specified = true; -+ { -+#if HAVE_MBRTOWC -+ if(MB_CUR_MAX > 1) -+ { -+ mbstate_t state; -+ -+ memset (&state, '\0', sizeof(mbstate_t)); -+ delimlen = mbrtowc (&wcdelim, optarg, strnlen(optarg, MB_LEN_MAX), &state); -+ -+ if (delimlen == (size_t)-1 || delimlen == (size_t)-2) -+ ++force_singlebyte_mode; -+ else -+ { -+ delimlen = (delimlen < 1) ? 
1 : delimlen; -+ if (wcdelim != L'\0' && *(optarg + delimlen) != '\0') -+ FATAL_ERROR (_("the delimiter must be a single character")); -+ memcpy (mbdelim, optarg, delimlen); -+ mbdelim[delimlen] = '\0'; -+ if (delimlen == 1) -+ delim = *optarg; -+ } -+ } -+ -+ if (MB_CUR_MAX <= 1 || force_singlebyte_mode) -+#endif -+ { -+ if (optarg[0] != '\0' && optarg[1] != '\0') -+ FATAL_ERROR (_("the delimiter must be a single character")); -+ delim = (unsigned char) optarg[0]; -+ } -+ delim_specified = true; -+ } - break; - - case OUTPUT_DELIMITER_OPTION: -@@ -540,6 +929,7 @@ main (int argc, char **argv) - break; - - case 'n': -+ byte_mode_character_aware = 1; - break; - - case 's': -@@ -579,15 +969,34 @@ main (int argc, char **argv) - | (complement ? SETFLD_COMPLEMENT : 0) ); - - if (!delim_specified) -- delim = '\t'; -+ { -+ delim = '\t'; -+#ifdef HAVE_MBRTOWC -+ wcdelim = L'\t'; -+ mbdelim[0] = '\t'; -+ mbdelim[1] = '\0'; -+ delimlen = 1; -+#endif -+ } - - if (output_delimiter_string == NULL) - { -- static char dummy[2]; -- dummy[0] = delim; -- dummy[1] = '\0'; -- output_delimiter_string = dummy; -- output_delimiter_length = 1; -+#ifdef HAVE_MBRTOWC -+ if (MB_CUR_MAX > 1 && !force_singlebyte_mode) -+ { -+ output_delimiter_string = xstrdup(mbdelim); -+ output_delimiter_length = delimlen; -+ } -+ -+ if (MB_CUR_MAX <= 1 || force_singlebyte_mode) -+#endif -+ { -+ static char dummy[2]; -+ dummy[0] = delim; -+ dummy[1] = '\0'; -+ output_delimiter_string = dummy; -+ output_delimiter_length = 1; -+ } - } - - if (optind == argc) diff --git a/coreutils/patches/coreutils-i18n-expand-unexpand.patch b/coreutils/patches/coreutils-i18n-expand-unexpand.patch deleted file mode 100644 index b5f571f5f..000000000 --- a/coreutils/patches/coreutils-i18n-expand-unexpand.patch +++ /dev/null @@ -1,848 +0,0 @@ -From e87ab5b991b08092a7e07af82b3ec822a8604151 Mon Sep 17 00:00:00 2001 -From: Ondrej Oprala -Date: Wed, 5 Aug 2015 09:15:09 +0200 -Subject: [PATCH] expand,unexpand: add multibyte support 
-MIME-Version: 1.0 -Content-Type: text/plain; charset=UTF-8 -Content-Transfer-Encoding: 8bit - -* NEWS: Mention the changes. -* bootstrap.conf: Add mbfile to the list of modules. -* configure.ac: Properly initialize mbfile. -* src/expand.c (expand): Iterate over multibyte characters properly. -* src/unexpand.c (unexpand): Iterate over multibyte characters -properly. -* tests/local.mk: Add new tests. -* tests/{expand,unexpand}/mb.sh: New tests. - -Co-authored-by: Pádraig Brady ---- - bootstrap.conf | 1 + - configure.ac | 2 + - lib/mbfile.c | 3 + - lib/mbfile.h | 255 +++++++++++++++++++++++++++++++++++++++++++++++++++ - m4/mbfile.m4 | 14 +++ - src/expand.c | 43 +++++---- - src/unexpand.c | 54 +++++++---- - tests/expand/mb.sh | 98 ++++++++++++++++++++ - tests/local.mk | 2 + - tests/unexpand/mb.sh | 97 ++++++++++++++++++++ - 10 files changed, 535 insertions(+), 34 deletions(-) - create mode 100644 lib/mbfile.c - create mode 100644 lib/mbfile.h - create mode 100644 m4/mbfile.m4 - create mode 100755 tests/expand/mb.sh - create mode 100755 tests/unexpand/mb.sh - -diff --git a/bootstrap.conf b/bootstrap.conf -index 8a0ff31..a1c78b2 100644 ---- a/bootstrap.conf -+++ b/bootstrap.conf -@@ -152,6 +152,7 @@ gnulib_modules=" - maintainer-makefile - malloc-gnu - manywarnings -+ mbfile - mbrlen - mbrtowc - mbsalign -diff --git a/configure.ac b/configure.ac -index 1e74b36..24c9725 100644 ---- a/configure.ac -+++ b/configure.ac -@@ -427,6 +427,8 @@ fi - # I'm leaving it here for now. This whole thing needs to be modernized... 
- gl_WINSIZE_IN_PTEM - -+gl_MBFILE -+ - gl_HEADER_TIOCGWINSZ_IN_TERMIOS_H - - if test $gl_cv_sys_tiocgwinsz_needs_termios_h = no && \ -diff --git a/lib/mbfile.c b/lib/mbfile.c -new file mode 100644 -index 0000000..b0a468e ---- /dev/null -+++ b/lib/mbfile.c -@@ -0,0 +1,3 @@ -+#include -+#define MBFILE_INLINE _GL_EXTERN_INLINE -+#include "mbfile.h" -diff --git a/lib/mbfile.h b/lib/mbfile.h -new file mode 100644 -index 0000000..11f1b12 ---- /dev/null -+++ b/lib/mbfile.h -@@ -0,0 +1,255 @@ -+/* Multibyte character I/O: macros for multi-byte encodings. -+ Copyright (C) 2001, 2005, 2009-2015 Free Software Foundation, Inc. -+ -+ This program is free software: you can redistribute it and/or modify -+ it under the terms of the GNU General Public License as published by -+ the Free Software Foundation; either version 3 of the License, or -+ (at your option) any later version. -+ -+ This program is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -+ GNU General Public License for more details. -+ -+ You should have received a copy of the GNU General Public License -+ along with this program. If not, see . */ -+ -+/* Written by Mitsuru Chinen -+ and Bruno Haible . */ -+ -+/* The macros in this file implement multi-byte character input from a -+ stream. -+ -+ mb_file_t -+ is the type for multibyte character input stream, usable for variable -+ declarations. -+ -+ mbf_char_t -+ is the type for multibyte character or EOF, usable for variable -+ declarations. -+ -+ mbf_init (mbf, stream) -+ initializes the MB_FILE for reading from stream. -+ -+ mbf_getc (mbc, mbf) -+ reads the next multibyte character from mbf and stores it in mbc. -+ -+ mb_iseof (mbc) -+ returns true if mbc represents the EOF value. -+ -+ Here are the function prototypes of the macros. 
-+ -+ extern void mbf_init (mb_file_t mbf, FILE *stream); -+ extern void mbf_getc (mbf_char_t mbc, mb_file_t mbf); -+ extern bool mb_iseof (const mbf_char_t mbc); -+ */ -+ -+#ifndef _MBFILE_H -+#define _MBFILE_H 1 -+ -+#include -+#include -+#include -+#include -+ -+/* Tru64 with Desktop Toolkit C has a bug: must be included before -+ . -+ BSD/OS 4.1 has a bug: and must be included before -+ . */ -+#include -+#include -+#include -+ -+#include "mbchar.h" -+ -+#ifndef _GL_INLINE_HEADER_BEGIN -+ #error "Please include config.h first." -+#endif -+_GL_INLINE_HEADER_BEGIN -+#ifndef MBFILE_INLINE -+# define MBFILE_INLINE _GL_INLINE -+#endif -+ -+struct mbfile_multi { -+ FILE *fp; -+ bool eof_seen; -+ bool have_pushback; -+ mbstate_t state; -+ unsigned int bufcount; -+ char buf[MBCHAR_BUF_SIZE]; -+ struct mbchar pushback; -+}; -+ -+MBFILE_INLINE void -+mbfile_multi_getc (struct mbchar *mbc, struct mbfile_multi *mbf) -+{ -+ size_t bytes; -+ -+ /* If EOF has already been seen, don't use getc. This matters if -+ mbf->fp is connected to an interactive tty. */ -+ if (mbf->eof_seen) -+ goto eof; -+ -+ /* Return character pushed back, if there is one. */ -+ if (mbf->have_pushback) -+ { -+ mb_copy (mbc, &mbf->pushback); -+ mbf->have_pushback = false; -+ return; -+ } -+ -+ /* Before using mbrtowc, we need at least one byte. */ -+ if (mbf->bufcount == 0) -+ { -+ int c = getc (mbf->fp); -+ if (c == EOF) -+ { -+ mbf->eof_seen = true; -+ goto eof; -+ } -+ mbf->buf[0] = (unsigned char) c; -+ mbf->bufcount++; -+ } -+ -+ /* Handle most ASCII characters quickly, without calling mbrtowc(). */ -+ if (mbf->bufcount == 1 && mbsinit (&mbf->state) && is_basic (mbf->buf[0])) -+ { -+ /* These characters are part of the basic character set. ISO C 99 -+ guarantees that their wide character code is identical to their -+ char code. 
*/ -+ mbc->wc = mbc->buf[0] = mbf->buf[0]; -+ mbc->wc_valid = true; -+ mbc->ptr = &mbc->buf[0]; -+ mbc->bytes = 1; -+ mbf->bufcount = 0; -+ return; -+ } -+ -+ /* Use mbrtowc on an increasing number of bytes. Read only as many bytes -+ from mbf->fp as needed. This is needed to give reasonable interactive -+ behaviour when mbf->fp is connected to an interactive tty. */ -+ for (;;) -+ { -+ /* We don't know whether the 'mbrtowc' function updates the state when -+ it returns -2, - this is the ISO C 99 and glibc-2.2 behaviour - or -+ not - amended ANSI C, glibc-2.1 and Solaris 2.7 behaviour. We -+ don't have an autoconf test for this, yet. -+ The new behaviour would allow us to feed the bytes one by one into -+ mbrtowc. But the old behaviour forces us to feed all bytes since -+ the end of the last character into mbrtowc. Since we want to retry -+ with more bytes when mbrtowc returns -2, we must backup the state -+ before calling mbrtowc, because implementations with the new -+ behaviour will clobber it. */ -+ mbstate_t backup_state = mbf->state; -+ -+ bytes = mbrtowc (&mbc->wc, &mbf->buf[0], mbf->bufcount, &mbf->state); -+ -+ if (bytes == (size_t) -1) -+ { -+ /* An invalid multibyte sequence was encountered. */ -+ /* Return a single byte. */ -+ bytes = 1; -+ mbc->wc_valid = false; -+ break; -+ } -+ else if (bytes == (size_t) -2) -+ { -+ /* An incomplete multibyte character. */ -+ mbf->state = backup_state; -+ if (mbf->bufcount == MBCHAR_BUF_SIZE) -+ { -+ /* An overlong incomplete multibyte sequence was encountered. */ -+ /* Return a single byte. */ -+ bytes = 1; -+ mbc->wc_valid = false; -+ break; -+ } -+ else -+ { -+ /* Read one more byte and retry mbrtowc. */ -+ int c = getc (mbf->fp); -+ if (c == EOF) -+ { -+ /* An incomplete multibyte character at the end. 
*/ -+ mbf->eof_seen = true; -+ bytes = mbf->bufcount; -+ mbc->wc_valid = false; -+ break; -+ } -+ mbf->buf[mbf->bufcount] = (unsigned char) c; -+ mbf->bufcount++; -+ } -+ } -+ else -+ { -+ if (bytes == 0) -+ { -+ /* A null wide character was encountered. */ -+ bytes = 1; -+ assert (mbf->buf[0] == '\0'); -+ assert (mbc->wc == 0); -+ } -+ mbc->wc_valid = true; -+ break; -+ } -+ } -+ -+ /* Return the multibyte sequence mbf->buf[0..bytes-1]. */ -+ mbc->ptr = &mbc->buf[0]; -+ memcpy (&mbc->buf[0], &mbf->buf[0], bytes); -+ mbc->bytes = bytes; -+ -+ mbf->bufcount -= bytes; -+ if (mbf->bufcount > 0) -+ { -+ /* It's not worth calling memmove() for so few bytes. */ -+ unsigned int count = mbf->bufcount; -+ char *p = &mbf->buf[0]; -+ -+ do -+ { -+ *p = *(p + bytes); -+ p++; -+ } -+ while (--count > 0); -+ } -+ return; -+ -+eof: -+ /* An mbchar_t with bytes == 0 is used to indicate EOF. */ -+ mbc->ptr = NULL; -+ mbc->bytes = 0; -+ mbc->wc_valid = false; -+ return; -+} -+ -+MBFILE_INLINE void -+mbfile_multi_ungetc (const struct mbchar *mbc, struct mbfile_multi *mbf) -+{ -+ mb_copy (&mbf->pushback, mbc); -+ mbf->have_pushback = true; -+} -+ -+typedef struct mbfile_multi mb_file_t; -+ -+typedef mbchar_t mbf_char_t; -+ -+#define mbf_init(mbf, stream) \ -+ ((mbf).fp = (stream), \ -+ (mbf).eof_seen = false, \ -+ (mbf).have_pushback = false, \ -+ memset (&(mbf).state, '\0', sizeof (mbstate_t)), \ -+ (mbf).bufcount = 0) -+ -+#define mbf_getc(mbc, mbf) mbfile_multi_getc (&(mbc), &(mbf)) -+ -+#define mbf_ungetc(mbc, mbf) mbfile_multi_ungetc (&(mbc), &(mbf)) -+ -+#define mb_iseof(mbc) ((mbc).bytes == 0) -+ -+#ifndef _GL_INLINE_HEADER_BEGIN -+ #error "Please include config.h first." -+#endif -+_GL_INLINE_HEADER_BEGIN -+ -+#endif /* _MBFILE_H */ -diff --git a/m4/mbfile.m4 b/m4/mbfile.m4 -new file mode 100644 -index 0000000..8589902 ---- /dev/null -+++ b/m4/mbfile.m4 -@@ -0,0 +1,14 @@ -+# mbfile.m4 serial 7 -+dnl Copyright (C) 2005, 2008-2015 Free Software Foundation, Inc. 
-+dnl This file is free software; the Free Software Foundation -+dnl gives unlimited permission to copy and/or distribute it, -+dnl with or without modifications, as long as this notice is preserved. -+ -+dnl autoconf tests required for use of mbfile.h -+dnl From Bruno Haible. -+ -+AC_DEFUN([gl_MBFILE], -+[ -+ AC_REQUIRE([AC_TYPE_MBSTATE_T]) -+ : -+]) -diff --git a/src/expand.c b/src/expand.c -index 9fa2e10..380e020 100644 ---- a/src/expand.c -+++ b/src/expand.c -@@ -37,6 +37,9 @@ - #include - #include - #include -+ -+#include -+ - #include "system.h" - #include "die.h" - #include "xstrndup.h" -@@ -100,19 +103,19 @@ expand (void) - { - /* Input stream. */ - FILE *fp = next_file (NULL); -+ mb_file_t mbf; -+ mbf_char_t c; - - if (!fp) - return; - -+ mbf_init (mbf, fp); -+ - while (true) - { -- /* Input character, or EOF. */ -- int c; -- - /* If true, perform translations. */ - bool convert = true; - -- - /* The following variables have valid values only when CONVERT - is true: */ - -@@ -122,17 +125,23 @@ expand (void) - /* Index in TAB_LIST of next tab stop to examine. */ - size_t tab_index = 0; - -- - /* Convert a line of text. */ - - do - { -- while ((c = getc (fp)) < 0 && (fp = next_file (fp))) -- continue; -+ do { -+ mbf_getc (c, mbf); -+ if (mb_iseof (c)) -+ { -+ mbf_init (mbf, fp = next_file (fp)); -+ continue; -+ } -+ } -+ while (false); - - if (convert) - { -- if (c == '\t') -+ if (mb_iseq (c, '\t')) - { - /* Column the next input tab stop is on. */ - uintmax_t next_tab_column; -@@ -151,32 +160,34 @@ expand (void) - if (putchar (' ') < 0) - die (EXIT_FAILURE, errno, _("write error")); - -- c = ' '; -+ mb_setascii (&c, ' '); - } -- else if (c == '\b') -+ else if (mb_iseq (c, '\b')) - { - /* Go back one column, and force recalculation of the - next tab stop. */ - column -= !!column; - tab_index -= !!tab_index; - } -- else -+ /* A leading control character could make us trip over. 
*/ -+ else if (!mb_iscntrl (c)) - { -- column++; -+ column += mb_width (c); - if (!column) - die (EXIT_FAILURE, 0, _("input line is too long")); - } - -- convert &= convert_entire_line || !! isblank (c); -+ convert &= convert_entire_line || mb_isblank (c); - } - -- if (c < 0) -+ if (mb_iseof (c)) - return; - -- if (putchar (c) < 0) -+ mb_putc (c, stdout); -+ if (ferror (stdout)) - die (EXIT_FAILURE, errno, _("write error")); - } -- while (c != '\n'); -+ while (!mb_iseq (c, '\n')); - } - } - -diff --git a/src/unexpand.c b/src/unexpand.c -index 7801274..569a7ee 100644 ---- a/src/unexpand.c -+++ b/src/unexpand.c -@@ -38,6 +38,9 @@ - #include - #include - #include -+ -+#include -+ - #include "system.h" - #include "die.h" - #include "xstrndup.h" -@@ -107,11 +110,12 @@ unexpand (void) - { - /* Input stream. */ - FILE *fp = next_file (NULL); -+ mb_file_t mbf; - - /* The array of pending blanks. In non-POSIX locales, blanks can - include characters other than spaces, so the blanks must be - stored, not merely counted. */ -- char *pending_blank; -+ mbf_char_t *pending_blank; - - if (!fp) - return; -@@ -119,12 +123,14 @@ unexpand (void) - /* The worst case is a non-blank character, then one blank, then a - tab stop, then MAX_COLUMN_WIDTH - 1 blanks, then a non-blank; so - allocate MAX_COLUMN_WIDTH bytes to store the blanks. */ -- pending_blank = xmalloc (max_column_width); -+ pending_blank = xmalloc (max_column_width * sizeof (mbf_char_t)); -+ -+ mbf_init (mbf, fp); - - while (true) - { - /* Input character, or EOF. */ -- int c; -+ mbf_char_t c; - - /* If true, perform translations. */ - bool convert = true; -@@ -158,12 +164,19 @@ unexpand (void) - - do - { -- while ((c = getc (fp)) < 0 && (fp = next_file (fp))) -- continue; -+ do { -+ mbf_getc (c, mbf); -+ if (mb_iseof (c)) -+ { -+ mbf_init (mbf, fp = next_file (fp)); -+ continue; -+ } -+ } -+ while (false); - - if (convert) - { -- bool blank = !! 
isblank (c); -+ bool blank = mb_isblank (c); - - if (blank) - { -@@ -180,16 +193,16 @@ unexpand (void) - if (next_tab_column < column) - die (EXIT_FAILURE, 0, _("input line is too long")); - -- if (c == '\t') -+ if (mb_iseq (c, '\t')) - { - column = next_tab_column; - - if (pending) -- pending_blank[0] = '\t'; -+ mb_setascii (&pending_blank[0], '\t'); - } - else - { -- column++; -+ column += mb_width (c); - - if (! (prev_blank && column == next_tab_column)) - { -@@ -197,13 +210,14 @@ unexpand (void) - will be replaced by tabs. */ - if (column == next_tab_column) - one_blank_before_tab_stop = true; -- pending_blank[pending++] = c; -+ mb_copy (&pending_blank[pending++], &c); - prev_blank = true; - continue; - } - - /* Replace the pending blanks by a tab or two. */ -- pending_blank[0] = c = '\t'; -+ mb_setascii (&c, '\t'); -+ mb_setascii (&pending_blank[0], '\t'); - } - - /* Discard pending blanks, unless it was a single -@@ -211,7 +225,7 @@ unexpand (void) - pending = one_blank_before_tab_stop; - } - } -- else if (c == '\b') -+ else if (mb_iseq (c, '\b')) - { - /* Go back one column, and force recalculation of the - next tab stop. 
*/ -@@ -221,7 +235,7 @@ unexpand (void) - } - else - { -- column++; -+ column += mb_width (c); - if (!column) - die (EXIT_FAILURE, 0, _("input line is too long")); - } -@@ -229,8 +243,11 @@ unexpand (void) - if (pending) - { - if (pending > 1 && one_blank_before_tab_stop) -- pending_blank[0] = '\t'; -- if (fwrite (pending_blank, 1, pending, stdout) != pending) -+ mb_setascii (&pending_blank[0], '\t'); -+ -+ for (int n = 0; n < pending; ++n) -+ mb_putc (pending_blank[n], stdout); -+ if (ferror (stdout)) - die (EXIT_FAILURE, errno, _("write error")); - pending = 0; - one_blank_before_tab_stop = false; -@@ -240,16 +257,17 @@ unexpand (void) - convert &= convert_entire_line || blank; - } - -- if (c < 0) -+ if (mb_iseof (c)) - { - free (pending_blank); - return; - } - -- if (putchar (c) < 0) -+ mb_putc (c, stdout); -+ if (ferror (stdout)) - die (EXIT_FAILURE, errno, _("write error")); - } -- while (c != '\n'); -+ while (!mb_iseq (c, '\n')); - } - } - -diff --git a/tests/expand/mb.sh b/tests/expand/mb.sh -new file mode 100755 -index 0000000..7971e18 ---- /dev/null -+++ b/tests/expand/mb.sh -@@ -0,0 +1,98 @@ -+#!/bin/sh -+ -+# Copyright (C) 2012-2015 Free Software Foundation, Inc. -+ -+# This program is free software: you can redistribute it and/or modify -+# it under the terms of the GNU General Public License as published by -+# the Free Software Foundation, either version 3 of the License, or -+# (at your option) any later version. -+ -+# This program is distributed in the hope that it will be useful, -+# but WITHOUT ANY WARRANTY; without even the implied warranty of -+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -+# GNU General Public License for more details. -+ -+# You should have received a copy of the GNU General Public License -+# along with this program. If not, see . -+ -+. 
"${srcdir=.}/tests/init.sh"; path_prepend_ ./src -+print_ver_ expand -+ -+export LC_ALL=en_US.UTF-8 -+ -+#input containing multibyte characters -+cat <<\EOF > in || framework_failure_ -+1234567812345678123456781 -+. . . . -+a b c d -+. . . . -+ä ö ü ß -+. . . . -+EOF -+env printf ' äöü\t. öüä. \tä xx\n' >> in || framework_failure_ -+ -+cat <<\EOF > exp || framework_failure_ -+1234567812345678123456781 -+. . . . -+a b c d -+. . . . -+ä ö ü ß -+. . . . -+ äöü . öüä. ä xx -+EOF -+ -+expand < in > out || fail=1 -+compare exp out > /dev/null 2>&1 || fail=1 -+ -+#test characters with display widths != 1 -+env printf '12345678 -+e\t|ascii(1) -+\u00E9\t|composed(1) -+e\u0301\t|decomposed(1) -+\u3000\t|ideo-space(2) -+\uFF0D\t|full-hypen(2) -+' > in || framework_failure_ -+ -+env printf '12345678 -+e |ascii(1) -+\u00E9 |composed(1) -+e\u0301 |decomposed(1) -+\u3000 |ideo-space(2) -+\uFF0D |full-hypen(2) -+' > exp || framework_failure_ -+ -+expand < in > out || fail=1 -+compare exp out > /dev/null 2>&1 || fail=1 -+ -+#shouldn't fail with "input line too long" -+#when a line starts with a control character -+env printf '\n' > in || framework_failure_ -+ -+expand < in > out || fail=1 -+compare in out > /dev/null 2>&1 || fail=1 -+ -+#non-Unicode characters interspersed between Unicode ones -+env printf '12345678 -+\t\xFF| -+\xFF\t| -+\t\xFFä| -+ä\xFF\t| -+\tä\xFF| -+\xFF\tä| -+äbcdef\xFF\t| -+' > in || framework_failure_ -+ -+env printf '12345678 -+ \xFF| -+\xFF | -+ \xFFä| -+ä\xFF | -+ ä\xFF| -+\xFF ä| -+äbcdef\xFF | -+' > exp || framework_failure_ -+ -+expand < in > out || fail=1 -+compare exp out > /dev/null 2>&1 || fail=1 -+ -+exit $fail -diff --git a/tests/local.mk b/tests/local.mk -index 192f776..8053397 100644 ---- a/tests/local.mk -+++ b/tests/local.mk -@@ -544,6 +544,7 @@ all_tests = \ - tests/du/threshold.sh \ - tests/du/trailing-slash.sh \ - tests/du/two-args.sh \ -+ tests/expand/mb.sh \ - tests/id/gnu-zero-uids.sh \ - tests/id/no-context.sh \ - tests/id/context.sh \ 
-@@ -684,6 +685,7 @@ all_tests = \ - tests/touch/read-only.sh \ - tests/touch/relative.sh \ - tests/touch/trailing-slash.sh \ -+ tests/unexpand/mb.sh \ - $(all_root_tests) - - # See tests/factor/create-test.sh. -diff --git a/tests/unexpand/mb.sh b/tests/unexpand/mb.sh -new file mode 100755 -index 0000000..60d4c1a ---- /dev/null -+++ b/tests/unexpand/mb.sh -@@ -0,0 +1,97 @@ -+#!/bin/sh -+ -+# Copyright (C) 2012-2015 Free Software Foundation, Inc. -+ -+# This program is free software: you can redistribute it and/or modify -+# it under the terms of the GNU General Public License as published by -+# the Free Software Foundation, either version 3 of the License, or -+# (at your option) any later version. -+ -+# This program is distributed in the hope that it will be useful, -+# but WITHOUT ANY WARRANTY; without even the implied warranty of -+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -+# GNU General Public License for more details. -+ -+# You should have received a copy of the GNU General Public License -+# along with this program. If not, see . -+ -+. "${srcdir=.}/tests/init.sh"; path_prepend_ ./src -+print_ver_ unexpand -+ -+export LC_ALL=en_US.UTF-8 -+ -+#input containing multibyte characters -+cat > in <<\EOF -+1234567812345678123456781 -+. . . . -+a b c d -+. . . . -+ä ö ü ß -+. . . . -+ äöü . öüä. ä xx -+EOF -+ -+cat > exp <<\EOF -+1234567812345678123456781 -+. . . . -+a b c d -+. . . . -+ä ö ü ß -+. . . . -+ äöü . öüä. 
ä xx -+EOF -+ -+unexpand -a < in > out || fail=1 -+compare exp out > /dev/null 2>&1 || fail=1 -+ -+#test characters with a display width larger than 1 -+ -+env printf '12345678 -+e |ascii(1) -+\u00E9 |composed(1) -+e\u0301 |decomposed(1) -+\u3000 |ideo-space(2) -+\uFF0D |full-hypen(2) -+' > in || framework_failure_ -+ -+env printf '12345678 -+e\t|ascii(1) -+\u00E9\t|composed(1) -+e\u0301\t|decomposed(1) -+\u3000\t|ideo-space(2) -+\uFF0D\t|full-hypen(2) -+' > exp || framework_failure_ -+ -+unexpand -a < in > out || fail=1 -+compare exp out > /dev/null 2>&1 || fail=1 -+ -+#test input where a blank of width > 1 is not being substituted -+in="$(LC_ALL=en_US.UTF-8 printf ' \u3000 ö ü ß')" -+exp='   ö ü ß' -+ -+unexpand -a < in > out || fail=1 -+compare exp out > /dev/null 2>&1 || fail=1 -+ -+#non-Unicode characters interspersed between Unicode ones -+env printf '12345678 -+ \xFF| -+\xFF | -+ \xFFä| -+ä\xFF | -+ ä\xFF| -+\xFF ä| -+äbcdef\xFF | -+' > in || framework_failure_ -+ -+env printf '12345678 -+\t\xFF| -+\xFF\t| -+\t\xFFä| -+ä\xFF\t| -+\tä\xFF| -+\xFF\tä| -+äbcdef\xFF\t| -+' > exp || framework_failure_ -+ -+unexpand -a < in > out || fail=1 -+compare exp out > /dev/null 2>&1 || fail=1 --- -2.7.4 - diff --git a/coreutils/patches/coreutils-i18n-fix-unexpand.patch b/coreutils/patches/coreutils-i18n-fix-unexpand.patch deleted file mode 100644 index f0c347cd2..000000000 --- a/coreutils/patches/coreutils-i18n-fix-unexpand.patch +++ /dev/null @@ -1,28 +0,0 @@ -From 02424bfcd719bbaa695f4e1c3ef17ad91b0d23c0 Mon Sep 17 00:00:00 2001 -From: Lubomir Rintel -Date: Thu, 28 Jan 2016 20:57:22 +0100 -Subject: [PATCH] unexpand: fix blank line handling - - echo '' |./src/unexpand -a - -Really? 
---- - src/unexpand.c | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/src/unexpand.c b/src/unexpand.c -index 569a7ee..3bbbd66 100644 ---- a/src/unexpand.c -+++ b/src/unexpand.c -@@ -233,7 +233,7 @@ unexpand (void) - next_tab_column = column; - tab_index -= !!tab_index; - } -- else -+ else if (!mb_iseq (c, '\n')) - { - column += mb_width (c); - if (!column) --- -2.7.4 - diff --git a/coreutils/patches/coreutils-i18n-fix2-expand-unexpand.patch b/coreutils/patches/coreutils-i18n-fix2-expand-unexpand.patch deleted file mode 100644 index b34d7b74f..000000000 --- a/coreutils/patches/coreutils-i18n-fix2-expand-unexpand.patch +++ /dev/null @@ -1,108 +0,0 @@ -diff --git a/src/expand.c b/src/expand.c -index 380e020..310b349 100644 ---- a/src/expand.c -+++ b/src/expand.c -@@ -129,15 +129,19 @@ expand (void) - - do - { -- do { -+ while (true) { - mbf_getc (c, mbf); -- if (mb_iseof (c)) -+ if ((mb_iseof (c)) && (fp = next_file (fp))) - { -- mbf_init (mbf, fp = next_file (fp)); -+ mbf_init (mbf, fp); - continue; - } -+ else -+ { -+ break; -+ } - } -- while (false); -+ - - if (convert) - { -diff --git a/src/unexpand.c b/src/unexpand.c -index 3bbbd66..863a90a 100644 ---- a/src/unexpand.c -+++ b/src/unexpand.c -@@ -164,15 +164,19 @@ unexpand (void) - - do - { -- do { -+ while (true) { - mbf_getc (c, mbf); -- if (mb_iseof (c)) -+ if ((mb_iseof (c)) && (fp = next_file (fp))) - { -- mbf_init (mbf, fp = next_file (fp)); -+ mbf_init (mbf, fp); - continue; - } -+ else -+ { -+ break; -+ } - } -- while (false); -+ - - if (convert) - { -diff --git a/tests/expand/mb.sh b/tests/expand/mb.sh -index 7971e18..031be7a 100755 ---- a/tests/expand/mb.sh -+++ b/tests/expand/mb.sh -@@ -44,6 +44,20 @@ EOF - expand < in > out || fail=1 - compare exp out > /dev/null 2>&1 || fail=1 - -+#multiple files as an input -+cat <<\EOF >> exp || framework_failure_ -+1234567812345678123456781 -+. . . . -+a b c d -+. . . . -+ä ö ü ß -+. . . . -+ äöü . öüä. 
ä xx -+EOF -+ -+expand ./in ./in > out || fail=1 -+compare exp out > /dev/null 2>&1 || fail=1 -+ - #test characters with display widths != 1 - env printf '12345678 - e\t|ascii(1) -diff --git a/tests/unexpand/mb.sh b/tests/unexpand/mb.sh -index 60d4c1a..8d75652 100755 ---- a/tests/unexpand/mb.sh -+++ b/tests/unexpand/mb.sh -@@ -44,6 +44,22 @@ EOF - unexpand -a < in > out || fail=1 - compare exp out > /dev/null 2>&1 || fail=1 - -+ -+#multiple files as an input -+cat >> exp <<\EOF -+1234567812345678123456781 -+. . . . -+a b c d -+. . . . -+ä ö ü ß -+. . . . -+ äöü . öüä. ä xx -+EOF -+ -+ -+unexpand -a ./in ./in > out || fail=1 -+compare exp out > /dev/null 2>&1 || fail=1 -+ - #test characters with a display width larger than 1 - - env printf '12345678 diff --git a/coreutils/patches/coreutils-i18n-sort-human.patch b/coreutils/patches/coreutils-i18n-sort-human.patch deleted file mode 100644 index 675249315..000000000 --- a/coreutils/patches/coreutils-i18n-sort-human.patch +++ /dev/null @@ -1,35 +0,0 @@ -From 3976ef5a20369d8b490907ab2cba2d617305a5e0 Mon Sep 17 00:00:00 2001 -From: Kamil Dudka -Date: Mon, 30 May 2016 16:19:20 +0200 -Subject: [PATCH] sort: do not use static array 'blanks' in human_numcompare() - -... because the array is not initialized with MB locales. Note this is -rather a conservative fix. I plan to do more cleanup of the i18n patch -in Fedora to prevent mistakes like this in future updates of coreutils. 
---- - src/sort.c | 8 +++----- - 1 file changed, 3 insertions(+), 5 deletions(-) - -diff --git a/src/sort.c b/src/sort.c -index 9e07ad8..e47b039 100644 ---- a/src/sort.c -+++ b/src/sort.c -@@ -2304,12 +2304,10 @@ find_unit_order (char const *number) - < K/k < M < G < T < P < E < Z < Y */ - - static int --human_numcompare (char const *a, char const *b) -+human_numcompare (char *a, char *b) - { -- while (blanks[to_uchar (*a)]) -- a++; -- while (blanks[to_uchar (*b)]) -- b++; -+ skipblanks(&a, a + strlen(a)); -+ skipblanks(&b, b + strlen(b)); - - int diff = find_unit_order (a) - find_unit_order (b); - return (diff ? diff : strnumcmp (a, b, decimal_point, thousands_sep)); --- -2.5.5 - diff --git a/coreutils/patches/coreutils-i18n-un-expand-BOM.patch b/coreutils/patches/coreutils-i18n-un-expand-BOM.patch deleted file mode 100644 index 6210ce7e8..000000000 --- a/coreutils/patches/coreutils-i18n-un-expand-BOM.patch +++ /dev/null @@ -1,456 +0,0 @@ -From 7a7c776a4e228d180e74614fd8c8afcad5d4bdf7 Mon Sep 17 00:00:00 2001 -From: Jakub Martisko -Date: Thu, 7 Jul 2016 12:53:26 +0200 -Subject: [PATCH] coreutils-i18n-un-expand-BOM.patch - ---- - src/expand-common.c | 114 +++++++++++++++++++++++++++++++++++++++++++++++++++ - src/expand-common.h | 12 ++++++ - src/expand.c | 45 +++++++++++++++++++- - src/unexpand.c | 43 ++++++++++++++++++- - tests/expand/mb.sh | 71 ++++++++++++++++++++++++++++++++ - tests/unexpand/mb.sh | 59 ++++++++++++++++++++++++++ - 6 files changed, 342 insertions(+), 2 deletions(-) - -diff --git a/src/expand-common.c b/src/expand-common.c -index 4657e46..97cbb09 100644 ---- a/src/expand-common.c -+++ b/src/expand-common.c -@@ -19,6 +19,7 @@ - #include - #include - #include -+#include - #include "system.h" - #include "die.h" - #include "error.h" -@@ -126,6 +127,119 @@ set_increment_size (uintmax_t tabval) - return ok; - } - -+extern int -+set_utf_locale (void) -+{ -+ /*try using some predefined locale */ -+ const char* predef_locales[] = 
{"C.UTF8","en_US.UTF8","en_GB.UTF8"}; -+ -+ const int predef_locales_count=3; -+ for (int i=0;ibufcount=0; -+ if (c == 0xEF) -+ { -+ c=fgetc(fp); -+ } -+ else -+ { -+ if (c != EOF) -+ { -+ ungetc(c,fp); -+ } -+ return false; -+ } -+ -+ if (c == 0xBB) -+ { -+ c=fgetc(fp); -+ } -+ else -+ { -+ if ( c!= EOF ) -+ { -+ mbf->buf[0]=(unsigned char) 0xEF; -+ mbf->bufcount=1; -+ ungetc(c,fp); -+ return false; -+ } -+ else -+ { -+ ungetc(0xEF,fp); -+ return false; -+ } -+ } -+ if (c == 0xBF) -+ { -+ mbf->bufcount=0; -+ return true; -+ } -+ else -+ { -+ if (c != EOF) -+ { -+ mbf->buf[0]=(unsigned char) 0xEF; -+ mbf->buf[1]=(unsigned char) 0xBB; -+ mbf->bufcount=2; -+ ungetc(c,fp); -+ return false; -+ } -+ else -+ { -+ mbf->buf[0]=(unsigned char) 0xEF; -+ mbf->bufcount=1; -+ ungetc(0xBB,fp); -+ return false; -+ } -+ } -+ return false; -+} -+ -+extern void -+print_bom(void) -+{ -+ putc (0xEF, stdout); -+ putc (0xBB, stdout); -+ putc (0xBF, stdout); -+} -+ - /* Add the comma or blank separated list of tab stops STOPS - to the list of tab stops. */ - extern void -diff --git a/src/expand-common.h b/src/expand-common.h -index 8cb2079..763bfda 100644 ---- a/src/expand-common.h -+++ b/src/expand-common.h -@@ -34,6 +34,18 @@ extern size_t max_column_width; - /* The desired exit status. */ - extern int exit_status; - -+extern int -+set_utf_locale (void); -+ -+extern bool -+check_utf_locale(void); -+ -+extern bool -+check_bom(FILE* fp, mb_file_t *mbf); -+ -+extern void -+print_bom(void); -+ - /* Add tab stop TABVAL to the end of 'tab_list'. */ - extern void - add_tab_stop (uintmax_t tabval); -diff --git a/src/expand.c b/src/expand.c -index 310b349..4136824 100644 ---- a/src/expand.c -+++ b/src/expand.c -@@ -103,11 +103,33 @@ expand (void) - FILE *fp = next_file (NULL); - mb_file_t mbf; - mbf_char_t c; -+ /* True if the starting locale is utf8. */ -+ bool using_utf_locale; -+ -+ /* True if the first file contains BOM header. 
*/ -+ bool found_bom; -+ using_utf_locale=check_utf_locale(); - - if (!fp) - return; -- - mbf_init (mbf, fp); -+ found_bom=check_bom(fp,&mbf); -+ -+ if (using_utf_locale == false && found_bom == true) -+ { -+ /*try using some predefined locale */ -+ -+ if (set_utf_locale () != 0) -+ { -+ error (EXIT_FAILURE, errno, _("cannot set UTF-8 locale")); -+ } -+ } -+ -+ -+ if (found_bom == true) -+ { -+ print_bom(); -+ } - - while (true) - { -@@ -132,6 +154,27 @@ expand (void) - if ((mb_iseof (c)) && (fp = next_file (fp))) - { - mbf_init (mbf, fp); -+ if (fp!=NULL) -+ { -+ if (check_bom(fp,&mbf)==true) -+ { -+ /*Not the first file - check BOM header*/ -+ if (using_utf_locale==false && found_bom==false) -+ { -+ /*BOM header in subsequent file but not in the first one. */ -+ error (EXIT_FAILURE, errno, _("combination of files with and without BOM header")); -+ } -+ } -+ else -+ { -+ if(using_utf_locale==false && found_bom==true) -+ { -+ /*First file conatined BOM header - locale was switched to UTF -+ *all subsequent files should contain BOM. */ -+ error (EXIT_FAILURE, errno, _("combination of files with and without BOM header")); -+ } -+ } -+ } - continue; - } - else -diff --git a/src/unexpand.c b/src/unexpand.c -index 863a90a..5681b58 100644 ---- a/src/unexpand.c -+++ b/src/unexpand.c -@@ -116,16 +116,36 @@ unexpand (void) - include characters other than spaces, so the blanks must be - stored, not merely counted. */ - mbf_char_t *pending_blank; -+ /* True if the starting locale is utf8. */ -+ bool using_utf_locale; -+ -+ /* True if the first file contains BOM header. 
*/ -+ bool found_bom; -+ using_utf_locale=check_utf_locale(); - - if (!fp) - return; -+ mbf_init (mbf, fp); -+ found_bom=check_bom(fp,&mbf); -+ -+ if (using_utf_locale == false && found_bom == true) -+ { -+ /*try using some predefined locale */ - -+ if (set_utf_locale () != 0) -+ { -+ error (EXIT_FAILURE, errno, _("cannot set UTF-8 locale")); -+ } -+ } - /* The worst case is a non-blank character, then one blank, then a - tab stop, then MAX_COLUMN_WIDTH - 1 blanks, then a non-blank; so - allocate MAX_COLUMN_WIDTH bytes to store the blanks. */ - pending_blank = xmalloc (max_column_width * sizeof (mbf_char_t)); - -- mbf_init (mbf, fp); -+ if (found_bom == true) -+ { -+ print_bom(); -+ } - - while (true) - { -@@ -169,6 +189,27 @@ unexpand (void) - if ((mb_iseof (c)) && (fp = next_file (fp))) - { - mbf_init (mbf, fp); -+ if (fp!=NULL) -+ { -+ if (check_bom(fp,&mbf)==true) -+ { -+ /*Not the first file - check BOM header*/ -+ if (using_utf_locale==false && found_bom==false) -+ { -+ /*BOM header in subsequent file but not in the first one. */ -+ error (EXIT_FAILURE, errno, _("combination of files with and without BOM header")); -+ } -+ } -+ else -+ { -+ if(using_utf_locale==false && found_bom==true) -+ { -+ /*First file conatined BOM header - locale was switched to UTF -+ *all subsequent files should contain BOM. */ -+ error (EXIT_FAILURE, errno, _("combination of files with and without BOM header")); -+ } -+ } -+ } - continue; - } - else -diff --git a/tests/expand/mb.sh b/tests/expand/mb.sh -index 031be7a..1621c84 100755 ---- a/tests/expand/mb.sh -+++ b/tests/expand/mb.sh -@@ -109,4 +109,75 @@ env printf '12345678 - expand < in > out || fail=1 - compare exp out > /dev/null 2>&1 || fail=1 - -+ -+ -+#BOM header test 1 -+printf "\xEF\xBB\xBF" > in; cat <<\EOF >> in || framework_failure_ -+1234567812345678123456781 -+. . . . -+a b c d -+. . . . -+ä ö ü ß -+. . . . -+EOF -+env printf ' äöü\t. öüä. 
\tä xx\n' >> in || framework_failure_ -+ -+printf "\xEF\xBB\xBF" > exp; cat <<\EOF >> exp || framework_failure_ -+1234567812345678123456781 -+. . . . -+a b c d -+. . . . -+ä ö ü ß -+. . . . -+ äöü . öüä. ä xx -+EOF -+ -+ -+expand < in > out || fail=1 -+compare exp out > /dev/null 2>&1 || fail=1 -+ -+LANG=C expand < in > out || fail=1 -+compare exp out > /dev/null 2>&1 || fail=1 -+ -+LC_ALL=C expand < in > out || fail=1 -+compare exp out > /dev/null 2>&1 || fail=1 -+ -+ -+printf '\xEF\xBB\xBF' > in1; cat <<\EOF >> in1 || framework_failure_ -+1234567812345678123456781 -+. . . . -+a b c d -+. . . . -+ä ö ü ß -+. . . . -+EOF -+env printf ' äöü\t. öüä. \tä xx\n' >> in1 || framework_failure_ -+ -+ -+printf '\xEF\xBB\xBF' > exp; cat <<\EOF >> exp || framework_failure_ -+1234567812345678123456781 -+. . . . -+a b c d -+. . . . -+ä ö ü ß -+. . . . -+ äöü . öüä. ä xx -+1234567812345678123456781 -+. . . . -+a b c d -+. . . . -+ä ö ü ß -+. . . . -+ äöü . öüä. ä xx -+EOF -+ -+expand in1 in1 > out || fail=1 -+compare exp out > /dev/null 2>&1 || fail=1 -+ -+LANG=C expand in1 in1 > out || fail=1 -+compare exp out > /dev/null 2>&1 || fail=1 -+ -+LC_ALL=C expand in1 in1 > out || fail=1 -+compare exp out > /dev/null 2>&1 || fail=1 -+ - exit $fail -diff --git a/tests/unexpand/mb.sh b/tests/unexpand/mb.sh -index 8d75652..9d4ee3e 100755 ---- a/tests/unexpand/mb.sh -+++ b/tests/unexpand/mb.sh -@@ -111,3 +111,62 @@ env printf '12345678 - - unexpand -a < in > out || fail=1 - compare exp out > /dev/null 2>&1 || fail=1 -+ -+#BOM header test 1 -+printf "\xEF\xBB\xBF" > in; cat <<\EOF >> in || framework_failure_ -+1234567812345678123456781 -+. . . . -+a b c d -+. . . . -+ä ö ü ß -+. . . . -+ äöü . öüä. ä xx -+EOF -+env printf ' äöü\t. öüä. \tä xx\n' >> in || framework_failure_ -+ -+printf "\xEF\xBB\xBF" > exp; cat <<\EOF >> exp || framework_failure_ -+1234567812345678123456781 -+. . . . -+a b c d -+. . . . -+ä ö ü ß -+. . . . -+ äöü . öüä. 
ä xx -+EOF -+ -+unexpand < in > out || fail=1 -+compare exp out > /dev/null 2>&1 || fail=1 -+ -+LANG=C unexpand < in > out || fail=1 -+compare exp out > /dev/null 2>&1 || fail=1 -+ -+LC_ALL=C unexpand < in > out || fail=1 -+compare exp out > /dev/null 2>&1 || fail=1 -+ -+ -+printf "\xEF\xBB\xBF" > exp; cat <<\EOF >> exp || framework_failure_ -+1234567812345678123456781 -+. . . . -+a b c d -+. . . . -+ä ö ü ß -+. . . . -+ äöü . öüä. ä xx -+1234567812345678123456781 -+. . . . -+a b c d -+. . . . -+ä ö ü ß -+. . . . -+ äöü . öüä. ä xx -+EOF -+ -+ -+unexpand in in > out || fail=1 -+compare exp out > /dev/null 2>&1 || fail=1 -+ -+LANG=C unexpand in in > out || fail=1 -+compare exp out > /dev/null 2>&1 || fail=1 -+ -+LC_ALL=C unexpand in in > out || fail=1 -+compare exp out > /dev/null 2>&1 || fail=1 --- -2.9.3 - diff --git a/coreutils/patches/coreutils-i18n.patch b/coreutils/patches/coreutils-i18n.patch index e3428d933..2dbcb6e02 100644 --- a/coreutils/patches/coreutils-i18n.patch +++ b/coreutils/patches/coreutils-i18n.patch @@ -1,37 +1,87 @@ -From 29117b2d07af00f4d4b87cf778e4294588ab1a83 Mon Sep 17 00:00:00 2001 -From: Kamil Dudka -Date: Thu, 1 Dec 2016 15:10:04 +0100 +Submitted by: Xi Ruoyao +Date: 2022-04-19 +Initial Package Version: 9.1 +Upstream Status: Rejected +Origin: https://src.fedoraproject.org/rpms/coreutils/raw/9325dbb/f/coreutils-i18n.patch +Description: Fixes i18n issues with various Coreutils programs + +From 01010419a6499768563e7b2f3fd56cf16edda75e Mon Sep 17 00:00:00 2001 +From: rpm-build +Date: Mon, 4 Oct 2021 08:54:37 +0200 Subject: [PATCH] coreutils-i18n.patch -TODO: merge upstream --- + bootstrap.conf | 1 + + configure.ac | 2 + lib/linebuffer.h | 8 + - src/fold.c | 308 ++++++++++++++++-- - src/join.c | 359 ++++++++++++++++++--- - src/pr.c | 443 ++++++++++++++++++++++--- - src/sort.c | 764 +++++++++++++++++++++++++++++++++++++++++--- - src/uniq.c | 265 ++++++++++++++- + lib/mbfile.c | 3 + + lib/mbfile.h | 255 ++++++++++++ + m4/mbfile.m4 | 14 + + 
src/cut.c | 508 +++++++++++++++++++++-- + src/expand-common.c | 114 ++++++ + src/expand-common.h | 12 + + src/expand.c | 90 +++- + src/fold.c | 312 ++++++++++++-- + src/join.c | 359 ++++++++++++++-- + src/local.mk | 4 +- + src/pr.c | 443 ++++++++++++++++++-- + src/sort.c | 792 +++++++++++++++++++++++++++++++++--- + src/unexpand.c | 101 ++++- + src/uniq.c | 119 +++++- + tests/Coreutils.pm | 3 + + tests/expand/mb.sh | 183 +++++++++ tests/i18n/sort.sh | 29 ++ - tests/local.mk | 2 + - tests/misc/expand.pl | 42 +++ + tests/local.mk | 4 + + tests/misc/expand.pl | 42 ++ tests/misc/fold.pl | 50 ++- tests/misc/join.pl | 50 +++ - tests/misc/sort-mb-tests.sh | 45 +++ - tests/misc/sort-merge.pl | 42 +++ - tests/misc/sort.pl | 40 ++- - tests/misc/unexpand.pl | 39 +++ - tests/misc/uniq.pl | 55 ++++ + tests/misc/sort-mb-tests.sh | 45 ++ + tests/misc/sort-merge.pl | 42 ++ + tests/misc/sort.pl | 40 +- + tests/misc/unexpand.pl | 39 ++ + tests/misc/uniq.pl | 55 +++ tests/pr/pr-tests.pl | 49 +++ - 17 files changed, 2430 insertions(+), 160 deletions(-) + tests/unexpand/mb.sh | 172 ++++++++ + 31 files changed, 3698 insertions(+), 242 deletions(-) + create mode 100644 lib/mbfile.c + create mode 100644 lib/mbfile.h + create mode 100644 m4/mbfile.m4 + create mode 100755 tests/expand/mb.sh create mode 100755 tests/i18n/sort.sh create mode 100755 tests/misc/sort-mb-tests.sh + create mode 100755 tests/unexpand/mb.sh +diff --git a/bootstrap.conf b/bootstrap.conf +index c1399e3..60b39cf 100644 +--- a/bootstrap.conf ++++ b/bootstrap.conf +@@ -162,6 +162,7 @@ gnulib_modules=" + maintainer-makefile + malloc-gnu + manywarnings ++ mbfile + mbrlen + mbrtowc + mbsalign +diff --git a/configure.ac b/configure.ac +index 7e4afc9..4656a35 100644 +--- a/configure.ac ++++ b/configure.ac +@@ -476,6 +476,8 @@ fi + # I'm leaving it here for now. This whole thing needs to be modernized... 
+ gl_WINSIZE_IN_PTEM + ++gl_MBFILE ++ + gl_HEADER_TIOCGWINSZ_IN_TERMIOS_H + + if test $gl_cv_sys_tiocgwinsz_needs_termios_h = no && \ diff --git a/lib/linebuffer.h b/lib/linebuffer.h -index 64181af..9b8fe5a 100644 +index 07d45ca..af62e6c 100644 --- a/lib/linebuffer.h +++ b/lib/linebuffer.h -@@ -21,6 +21,11 @@ - +@@ -22,6 +22,11 @@ + # include "idx.h" # include +/* Get mbstate_t. */ @@ -41,19 +91,1280 @@ index 64181af..9b8fe5a 100644 + /* A 'struct linebuffer' holds a line of text. */ - struct linebuffer -@@ -28,6 +33,9 @@ struct linebuffer - size_t size; /* Allocated. */ - size_t length; /* Used. */ - char *buffer; -+# if HAVE_WCHAR_H -+ mbstate_t state; -+# endif - }; + struct linebuffer +@@ -29,6 +34,9 @@ struct linebuffer + idx_t size; /* Allocated. */ + idx_t length; /* Used. */ + char *buffer; ++# if HAVE_WCHAR_H ++ mbstate_t state; ++# endif + }; + + /* Initialize linebuffer LINEBUFFER for use. */ +diff --git a/lib/mbfile.c b/lib/mbfile.c +new file mode 100644 +index 0000000..b0a468e +--- /dev/null ++++ b/lib/mbfile.c +@@ -0,0 +1,3 @@ ++#include ++#define MBFILE_INLINE _GL_EXTERN_INLINE ++#include "mbfile.h" +diff --git a/lib/mbfile.h b/lib/mbfile.h +new file mode 100644 +index 0000000..11f1b12 +--- /dev/null ++++ b/lib/mbfile.h +@@ -0,0 +1,255 @@ ++/* Multibyte character I/O: macros for multi-byte encodings. ++ Copyright (C) 2001, 2005, 2009-2015 Free Software Foundation, Inc. ++ ++ This program is free software: you can redistribute it and/or modify ++ it under the terms of the GNU General Public License as published by ++ the Free Software Foundation; either version 3 of the License, or ++ (at your option) any later version. ++ ++ This program is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ GNU General Public License for more details. 
++ ++ You should have received a copy of the GNU General Public License ++ along with this program. If not, see . */ ++ ++/* Written by Mitsuru Chinen ++ and Bruno Haible . */ ++ ++/* The macros in this file implement multi-byte character input from a ++ stream. ++ ++ mb_file_t ++ is the type for multibyte character input stream, usable for variable ++ declarations. ++ ++ mbf_char_t ++ is the type for multibyte character or EOF, usable for variable ++ declarations. ++ ++ mbf_init (mbf, stream) ++ initializes the MB_FILE for reading from stream. ++ ++ mbf_getc (mbc, mbf) ++ reads the next multibyte character from mbf and stores it in mbc. ++ ++ mb_iseof (mbc) ++ returns true if mbc represents the EOF value. ++ ++ Here are the function prototypes of the macros. ++ ++ extern void mbf_init (mb_file_t mbf, FILE *stream); ++ extern void mbf_getc (mbf_char_t mbc, mb_file_t mbf); ++ extern bool mb_iseof (const mbf_char_t mbc); ++ */ ++ ++#ifndef _MBFILE_H ++#define _MBFILE_H 1 ++ ++#include ++#include ++#include ++#include ++ ++/* Tru64 with Desktop Toolkit C has a bug: must be included before ++ . ++ BSD/OS 4.1 has a bug: and must be included before ++ . */ ++#include ++#include ++#include ++ ++#include "mbchar.h" ++ ++#ifndef _GL_INLINE_HEADER_BEGIN ++ #error "Please include config.h first." ++#endif ++_GL_INLINE_HEADER_BEGIN ++#ifndef MBFILE_INLINE ++# define MBFILE_INLINE _GL_INLINE ++#endif ++ ++struct mbfile_multi { ++ FILE *fp; ++ bool eof_seen; ++ bool have_pushback; ++ mbstate_t state; ++ unsigned int bufcount; ++ char buf[MBCHAR_BUF_SIZE]; ++ struct mbchar pushback; ++}; ++ ++MBFILE_INLINE void ++mbfile_multi_getc (struct mbchar *mbc, struct mbfile_multi *mbf) ++{ ++ size_t bytes; ++ ++ /* If EOF has already been seen, don't use getc. This matters if ++ mbf->fp is connected to an interactive tty. */ ++ if (mbf->eof_seen) ++ goto eof; ++ ++ /* Return character pushed back, if there is one. 
*/ ++ if (mbf->have_pushback) ++ { ++ mb_copy (mbc, &mbf->pushback); ++ mbf->have_pushback = false; ++ return; ++ } ++ ++ /* Before using mbrtowc, we need at least one byte. */ ++ if (mbf->bufcount == 0) ++ { ++ int c = getc (mbf->fp); ++ if (c == EOF) ++ { ++ mbf->eof_seen = true; ++ goto eof; ++ } ++ mbf->buf[0] = (unsigned char) c; ++ mbf->bufcount++; ++ } ++ ++ /* Handle most ASCII characters quickly, without calling mbrtowc(). */ ++ if (mbf->bufcount == 1 && mbsinit (&mbf->state) && is_basic (mbf->buf[0])) ++ { ++ /* These characters are part of the basic character set. ISO C 99 ++ guarantees that their wide character code is identical to their ++ char code. */ ++ mbc->wc = mbc->buf[0] = mbf->buf[0]; ++ mbc->wc_valid = true; ++ mbc->ptr = &mbc->buf[0]; ++ mbc->bytes = 1; ++ mbf->bufcount = 0; ++ return; ++ } ++ ++ /* Use mbrtowc on an increasing number of bytes. Read only as many bytes ++ from mbf->fp as needed. This is needed to give reasonable interactive ++ behaviour when mbf->fp is connected to an interactive tty. */ ++ for (;;) ++ { ++ /* We don't know whether the 'mbrtowc' function updates the state when ++ it returns -2, - this is the ISO C 99 and glibc-2.2 behaviour - or ++ not - amended ANSI C, glibc-2.1 and Solaris 2.7 behaviour. We ++ don't have an autoconf test for this, yet. ++ The new behaviour would allow us to feed the bytes one by one into ++ mbrtowc. But the old behaviour forces us to feed all bytes since ++ the end of the last character into mbrtowc. Since we want to retry ++ with more bytes when mbrtowc returns -2, we must backup the state ++ before calling mbrtowc, because implementations with the new ++ behaviour will clobber it. */ ++ mbstate_t backup_state = mbf->state; ++ ++ bytes = mbrtowc (&mbc->wc, &mbf->buf[0], mbf->bufcount, &mbf->state); ++ ++ if (bytes == (size_t) -1) ++ { ++ /* An invalid multibyte sequence was encountered. */ ++ /* Return a single byte. 
*/ ++ bytes = 1; ++ mbc->wc_valid = false; ++ break; ++ } ++ else if (bytes == (size_t) -2) ++ { ++ /* An incomplete multibyte character. */ ++ mbf->state = backup_state; ++ if (mbf->bufcount == MBCHAR_BUF_SIZE) ++ { ++ /* An overlong incomplete multibyte sequence was encountered. */ ++ /* Return a single byte. */ ++ bytes = 1; ++ mbc->wc_valid = false; ++ break; ++ } ++ else ++ { ++ /* Read one more byte and retry mbrtowc. */ ++ int c = getc (mbf->fp); ++ if (c == EOF) ++ { ++ /* An incomplete multibyte character at the end. */ ++ mbf->eof_seen = true; ++ bytes = mbf->bufcount; ++ mbc->wc_valid = false; ++ break; ++ } ++ mbf->buf[mbf->bufcount] = (unsigned char) c; ++ mbf->bufcount++; ++ } ++ } ++ else ++ { ++ if (bytes == 0) ++ { ++ /* A null wide character was encountered. */ ++ bytes = 1; ++ assert (mbf->buf[0] == '\0'); ++ assert (mbc->wc == 0); ++ } ++ mbc->wc_valid = true; ++ break; ++ } ++ } ++ ++ /* Return the multibyte sequence mbf->buf[0..bytes-1]. */ ++ mbc->ptr = &mbc->buf[0]; ++ memcpy (&mbc->buf[0], &mbf->buf[0], bytes); ++ mbc->bytes = bytes; ++ ++ mbf->bufcount -= bytes; ++ if (mbf->bufcount > 0) ++ { ++ /* It's not worth calling memmove() for so few bytes. */ ++ unsigned int count = mbf->bufcount; ++ char *p = &mbf->buf[0]; ++ ++ do ++ { ++ *p = *(p + bytes); ++ p++; ++ } ++ while (--count > 0); ++ } ++ return; ++ ++eof: ++ /* An mbchar_t with bytes == 0 is used to indicate EOF. 
*/ ++ mbc->ptr = NULL; ++ mbc->bytes = 0; ++ mbc->wc_valid = false; ++ return; ++} ++ ++MBFILE_INLINE void ++mbfile_multi_ungetc (const struct mbchar *mbc, struct mbfile_multi *mbf) ++{ ++ mb_copy (&mbf->pushback, mbc); ++ mbf->have_pushback = true; ++} ++ ++typedef struct mbfile_multi mb_file_t; ++ ++typedef mbchar_t mbf_char_t; ++ ++#define mbf_init(mbf, stream) \ ++ ((mbf).fp = (stream), \ ++ (mbf).eof_seen = false, \ ++ (mbf).have_pushback = false, \ ++ memset (&(mbf).state, '\0', sizeof (mbstate_t)), \ ++ (mbf).bufcount = 0) ++ ++#define mbf_getc(mbc, mbf) mbfile_multi_getc (&(mbc), &(mbf)) ++ ++#define mbf_ungetc(mbc, mbf) mbfile_multi_ungetc (&(mbc), &(mbf)) ++ ++#define mb_iseof(mbc) ((mbc).bytes == 0) ++ ++#ifndef _GL_INLINE_HEADER_BEGIN ++ #error "Please include config.h first." ++#endif ++_GL_INLINE_HEADER_BEGIN ++ ++#endif /* _MBFILE_H */ +diff --git a/m4/mbfile.m4 b/m4/mbfile.m4 +new file mode 100644 +index 0000000..8589902 +--- /dev/null ++++ b/m4/mbfile.m4 +@@ -0,0 +1,14 @@ ++# mbfile.m4 serial 7 ++dnl Copyright (C) 2005, 2008-2015 Free Software Foundation, Inc. ++dnl This file is free software; the Free Software Foundation ++dnl gives unlimited permission to copy and/or distribute it, ++dnl with or without modifications, as long as this notice is preserved. ++ ++dnl autoconf tests required for use of mbfile.h ++dnl From Bruno Haible. ++ ++AC_DEFUN([gl_MBFILE], ++[ ++ AC_REQUIRE([AC_TYPE_MBSTATE_T]) ++ : ++]) +diff --git a/src/cut.c b/src/cut.c +index 6fd8978..faef877 100644 +--- a/src/cut.c ++++ b/src/cut.c +@@ -28,6 +28,11 @@ + #include + #include + #include ++ ++/* Get mbstate_t, mbrtowc(). */ ++#if HAVE_WCHAR_H ++# include ++#endif + #include "system.h" + + #include "error.h" +@@ -37,6 +42,18 @@ + + #include "set-fields.h" + ++/* MB_LEN_MAX is incorrectly defined to be 1 in at least one GCC ++ installation; work around this configuration error. 
*/ ++#if !defined MB_LEN_MAX || MB_LEN_MAX < 2 ++# undef MB_LEN_MAX ++# define MB_LEN_MAX 16 ++#endif ++ ++/* Some systems, like BeOS, have multibyte encodings but lack mbstate_t. */ ++#if HAVE_MBRTOWC && defined mbstate_t ++# define mbrtowc(pwc, s, n, ps) (mbrtowc) (pwc, s, n, 0) ++#endif ++ + /* The official name of this program (e.g., no 'g' prefix). */ + #define PROGRAM_NAME "cut" + +@@ -53,6 +70,52 @@ + } \ + while (0) + ++/* Refill the buffer BUF to get a multibyte character. */ ++#define REFILL_BUFFER(BUF, BUFPOS, BUFLEN, STREAM) \ ++ do \ ++ { \ ++ if (BUFLEN < MB_LEN_MAX && !feof (STREAM) && !ferror (STREAM)) \ ++ { \ ++ memmove (BUF, BUFPOS, BUFLEN); \ ++ BUFLEN += fread (BUF + BUFLEN, sizeof(char), BUFSIZ, STREAM); \ ++ BUFPOS = BUF; \ ++ } \ ++ } \ ++ while (0) ++ ++/* Get wide character on BUFPOS. BUFPOS is not included after that. ++ If byte sequence is not valid as a character, CONVFAIL is true. Otherwise false. */ ++#define GET_NEXT_WC_FROM_BUFFER(WC, BUFPOS, BUFLEN, MBLENGTH, STATE, CONVFAIL) \ ++ do \ ++ { \ ++ mbstate_t state_bak; \ ++ \ ++ if (BUFLEN < 1) \ ++ { \ ++ WC = WEOF; \ ++ break; \ ++ } \ ++ \ ++ /* Get a wide character. */ \ ++ CONVFAIL = false; \ ++ state_bak = STATE; \ ++ MBLENGTH = mbrtowc ((wchar_t *)&WC, BUFPOS, BUFLEN, &STATE); \ ++ \ ++ switch (MBLENGTH) \ ++ { \ ++ case (size_t)-1: \ ++ case (size_t)-2: \ ++ CONVFAIL = true; \ ++ STATE = state_bak; \ ++ /* Fall through. */ \ ++ \ ++ case 0: \ ++ MBLENGTH = 1; \ ++ break; \ ++ } \ ++ } \ ++ while (0) ++ + + /* Pointer inside RP. When checking if a byte or field is selected + by a finite range, we check if it is between CURRENT_RP.LO +@@ -60,6 +123,9 @@ + CURRENT_RP.HI then we make CURRENT_RP to point to the next range pair. */ + static struct field_range_pair *current_rp; + ++/* Length of the delimiter given as argument to -d.
*/ ++size_t delimlen; ++ + /* This buffer is used to support the semantics of the -s option + (or lack of same) when the specified field list includes (does + not include) the first field. In both of those cases, the entire +@@ -72,6 +138,29 @@ static char *field_1_buffer; + /* The number of bytes allocated for FIELD_1_BUFFER. */ + static size_t field_1_bufsize; + ++enum operating_mode ++ { ++ undefined_mode, ++ ++ /* Output bytes that are at the given positions. */ ++ byte_mode, ++ ++ /* Output characters that are at the given positions. */ ++ character_mode, ++ ++ /* Output the given delimiter-separated fields. */ ++ field_mode ++ }; ++ ++static enum operating_mode operating_mode; ++ ++/* If nonzero, when in byte mode, don't split multibyte characters. */ ++static int byte_mode_character_aware; ++ ++/* If nonzero, the single-byte code paths are used ++ even if this program runs in a multibyte locale. */ ++static int force_singlebyte_mode; ++ + /* If true do not output lines containing no delimiter characters. + Otherwise, all such lines are printed. This option is valid only + with field mode. */ +@@ -83,10 +172,16 @@ static bool complement; + + /* The delimiter character for field mode. */ + static unsigned char delim; ++#if HAVE_WCHAR_H ++static wchar_t wcdelim; ++#endif + + /* The delimiter for each line/record. */ + static unsigned char line_delim = '\n'; + ++/* True if the --output-delimiter=STRING option was specified. */ ++static bool output_delimiter_specified; ++ + /* The length of output_delimiter_string. */ + static size_t output_delimiter_length; + +@@ -94,9 +189,6 @@ static size_t output_delimiter_length; + string consisting of the input delimiter. */ + static char *output_delimiter_string; + +-/* The output delimiter string contents, if the default. */ +-static char output_delimiter_default[1]; +- + /* True if we have ever read standard input.
*/ + static bool have_read_stdin; + +@@ -150,7 +242,7 @@ Print selected parts of lines from each FILE to standard output.\n\ + -f, --fields=LIST select only these fields; also print any line\n\ + that contains no delimiter character, unless\n\ + the -s option is specified\n\ +- -n (ignored)\n\ ++ -n with -b: don't split multibyte characters\n\ + "), stdout); + fputs (_("\ + --complement complement the set of selected bytes, characters\n\ +@@ -250,7 +342,7 @@ cut_bytes (FILE *stream) + next_item (&byte_idx); + if (print_kth (byte_idx)) + { +- if (output_delimiter_string != output_delimiter_default) ++ if (output_delimiter_specified) + { + if (print_delimiter && is_range_start_index (byte_idx)) + { +@@ -266,6 +358,82 @@ cut_bytes (FILE *stream) + } + } + ++#if HAVE_MBRTOWC ++/* This function is in use for the following case. ++ ++ 1. Read from the stream STREAM, printing to standard output any selected ++ characters. ++ ++ 2. Read from stream STREAM, printing to standard output any selected bytes, ++ without splitting multibyte characters. */ ++ ++static void ++cut_characters_or_cut_bytes_no_split (FILE *stream) ++{ ++ uintmax_t idx; /* number of bytes or characters in the line so far. */ ++ char buf[MB_LEN_MAX + BUFSIZ]; /* For spooling a read byte sequence. */ ++ char *bufpos; /* Next read position of BUF. */ ++ size_t buflen; /* The length of the byte sequence in buf. */ ++ wint_t wc; /* A gotten wide character. */ ++ size_t mblength; /* The byte size of a multibyte character which shows ++ as same character as WC. */ ++ mbstate_t state; /* State of the stream. */ ++ bool convfail = false; /* true, when conversion failed. Otherwise false. */ ++ /* Whether to begin printing delimiters between ranges for the current line. ++ Set after we've begun printing data corresponding to the first range. 
*/ ++ bool print_delimiter = false; ++ ++ idx = 0; ++ buflen = 0; ++ bufpos = buf; ++ memset (&state, '\0', sizeof(mbstate_t)); ++ ++ current_rp = frp; ++ ++ while (1) ++ { ++ REFILL_BUFFER (buf, bufpos, buflen, stream); ++ ++ GET_NEXT_WC_FROM_BUFFER (wc, bufpos, buflen, mblength, state, convfail); ++ (void) convfail; /* ignore unused */ ++ ++ if (wc == WEOF) ++ { ++ if (idx > 0) ++ putchar (line_delim); ++ break; ++ } ++ else if (wc == line_delim) ++ { ++ putchar (line_delim); ++ idx = 0; ++ print_delimiter = false; ++ current_rp = frp; ++ } ++ else ++ { ++ next_item (&idx); ++ if (print_kth (idx)) ++ { ++ if (output_delimiter_specified) ++ { ++ if (print_delimiter && is_range_start_index (idx)) ++ { ++ fwrite (output_delimiter_string, sizeof (char), ++ output_delimiter_length, stdout); ++ } ++ print_delimiter = true; ++ } ++ fwrite (bufpos, mblength, sizeof(char), stdout); ++ } ++ } ++ ++ buflen -= mblength; ++ bufpos += mblength; ++ } ++} ++#endif ++ + /* Read from stream STREAM, printing to standard output any selected fields. */ + + static void +@@ -411,11 +579,218 @@ cut_fields (FILE *stream) + } + } + +-/* Process file FILE to standard output, using CUT_STREAM. ++#if HAVE_MBRTOWC ++static void ++cut_fields_mb (FILE *stream) ++{ ++ int c; ++ uintmax_t field_idx; ++ int found_any_selected_field; ++ int buffer_first_field; ++ int empty_input; ++ char buf[MB_LEN_MAX + BUFSIZ]; /* For spooling a read byte sequence. */ ++ char *bufpos; /* Next read position of BUF. */ ++ size_t buflen; /* The length of the byte sequence in buf. */ ++ wint_t wc = 0; /* A gotten wide character. */ ++ size_t mblength; /* The byte size of a multibyte character which shows ++ as same character as WC. */ ++ mbstate_t state; /* State of the stream. */ ++ bool convfail = false; /* true, when conversion failed. Otherwise false. 
*/ ++ ++ current_rp = frp; ++ ++ found_any_selected_field = 0; ++ field_idx = 1; ++ bufpos = buf; ++ buflen = 0; ++ memset (&state, '\0', sizeof(mbstate_t)); ++ ++ c = getc (stream); ++ empty_input = (c == EOF); ++ if (c != EOF) ++ { ++ ungetc (c, stream); ++ wc = 0; ++ } ++ else ++ wc = WEOF; ++ ++ /* To support the semantics of the -s flag, we may have to buffer ++ all of the first field to determine whether it is `delimited.' ++ But that is unnecessary if all non-delimited lines must be printed ++ and the first field has been selected, or if non-delimited lines ++ must be suppressed and the first field has *not* been selected. ++ That is because a non-delimited line has exactly one field. */ ++ buffer_first_field = (suppress_non_delimited ^ !print_kth (1)); ++ ++ while (1) ++ { ++ if (field_idx == 1 && buffer_first_field) ++ { ++ int len = 0; ++ ++ while (1) ++ { ++ REFILL_BUFFER (buf, bufpos, buflen, stream); ++ ++ GET_NEXT_WC_FROM_BUFFER ++ (wc, bufpos, buflen, mblength, state, convfail); ++ ++ if (wc == WEOF) ++ break; ++ ++ field_1_buffer = xrealloc (field_1_buffer, len + mblength); ++ memcpy (field_1_buffer + len, bufpos, mblength); ++ len += mblength; ++ buflen -= mblength; ++ bufpos += mblength; ++ ++ if (!convfail && (wc == line_delim || wc == wcdelim)) ++ break; ++ } ++ ++ if (len <= 0 && wc == WEOF) ++ break; ++ ++ /* If the first field extends to the end of line (it is not ++ delimited) and we are printing all non-delimited lines, ++ print this one. */ ++ if (convfail || (!convfail && wc != wcdelim)) ++ { ++ if (suppress_non_delimited) ++ { ++ /* Empty. */ ++ } ++ else ++ { ++ fwrite (field_1_buffer, sizeof (char), len, stdout); ++ /* Make sure the output line is newline terminated. */ ++ if (convfail || (!convfail && wc != line_delim)) ++ putchar (line_delim); ++ } ++ continue; ++ } ++ ++ if (print_kth (1)) ++ { ++ /* Print the field, but not the trailing delimiter. 
*/ ++ fwrite (field_1_buffer, sizeof (char), len - 1, stdout); ++ found_any_selected_field = 1; ++ } ++ next_item (&field_idx); ++ } ++ ++ if (wc != WEOF) ++ { ++ if (print_kth (field_idx)) ++ { ++ if (found_any_selected_field) ++ { ++ fwrite (output_delimiter_string, sizeof (char), ++ output_delimiter_length, stdout); ++ } ++ found_any_selected_field = 1; ++ } ++ ++ while (1) ++ { ++ REFILL_BUFFER (buf, bufpos, buflen, stream); ++ ++ GET_NEXT_WC_FROM_BUFFER ++ (wc, bufpos, buflen, mblength, state, convfail); ++ ++ if (wc == WEOF) ++ break; ++ else if (!convfail && (wc == wcdelim || wc == line_delim)) ++ { ++ buflen -= mblength; ++ bufpos += mblength; ++ break; ++ } ++ ++ if (print_kth (field_idx)) ++ fwrite (bufpos, mblength, sizeof(char), stdout); ++ ++ buflen -= mblength; ++ bufpos += mblength; ++ } ++ } ++ ++ if ((!convfail || wc == line_delim) && buflen < 1) ++ wc = WEOF; ++ ++ if (!convfail && wc == wcdelim) ++ next_item (&field_idx); ++ else if (wc == WEOF || (!convfail && wc == line_delim)) ++ { ++ if (found_any_selected_field ++ || (!empty_input && !(suppress_non_delimited && field_idx == 1))) ++ putchar (line_delim); ++ if (wc == WEOF) ++ break; ++ field_idx = 1; ++ current_rp = frp; ++ found_any_selected_field = 0; ++ } ++ } ++} ++#endif ++ ++static void ++cut_stream (FILE *stream) ++{ ++#if HAVE_MBRTOWC ++ if (MB_CUR_MAX > 1 && !force_singlebyte_mode) ++ { ++ switch (operating_mode) ++ { ++ case byte_mode: ++ if (byte_mode_character_aware) ++ cut_characters_or_cut_bytes_no_split (stream); ++ else ++ cut_bytes (stream); ++ break; ++ ++ case character_mode: ++ cut_characters_or_cut_bytes_no_split (stream); ++ break; ++ ++ case field_mode: ++ if (delimlen == 1) ++ { ++ /* Check if we have utf8 multibyte locale, so we can use this ++ optimization because of uniqueness of characters, which is ++ not true for e.g. 
SJIS */ ++ char * loc = setlocale(LC_CTYPE, NULL); ++ if (loc && (strstr (loc, "UTF-8") || strstr (loc, "utf-8") || ++ strstr (loc, "UTF8") || strstr (loc, "utf8"))) ++ { ++ cut_fields (stream); ++ break; ++ } ++ } ++ cut_fields_mb (stream); ++ break; ++ ++ default: ++ abort (); ++ } ++ } ++ else ++#endif ++ { ++ if (operating_mode == field_mode) ++ cut_fields (stream); ++ else ++ cut_bytes (stream); ++ } ++} ++ ++/* Process file FILE to standard output. + Return true if successful. */ + + static bool +-cut_file (char const *file, void (*cut_stream) (FILE *)) ++cut_file (char const *file) + { + FILE *stream; + +@@ -459,8 +834,8 @@ main (int argc, char **argv) + int optc; + bool ok; + bool delim_specified = false; +- bool byte_mode = false; +- char *spec_list_string = NULL; ++ char *spec_list_string IF_LINT ( = NULL); ++ char mbdelim[MB_LEN_MAX + 1]; + + initialize_main (&argc, &argv); + set_program_name (argv[0]); +@@ -470,6 +845,8 @@ main (int argc, char **argv) + + atexit (close_stdout); + ++ operating_mode = undefined_mode; ++ + /* By default, all non-delimited lines are printed. */ + suppress_non_delimited = false; + +@@ -481,35 +858,77 @@ main (int argc, char **argv) + switch (optc) + { + case 'b': +- case 'c': + /* Build the byte list. */ +- byte_mode = true; +- FALLTHROUGH; ++ if (operating_mode != undefined_mode) ++ FATAL_ERROR (_("only one type of list may be specified")); ++ operating_mode = byte_mode; ++ spec_list_string = optarg; ++ break; ++ ++ case 'c': ++ /* Build the character list. */ ++ if (operating_mode != undefined_mode) ++ FATAL_ERROR (_("only one type of list may be specified")); ++ operating_mode = character_mode; ++ spec_list_string = optarg; ++ break; ++ + case 'f': + /* Build the field list. 
*/ +- if (spec_list_string) +- FATAL_ERROR (_("only one list may be specified")); ++ if (operating_mode != undefined_mode) ++ FATAL_ERROR (_("only one type of list may be specified")); ++ operating_mode = field_mode; + spec_list_string = optarg; + break; + + case 'd': + /* New delimiter. */ + /* Interpret -d '' to mean 'use the NUL byte as the delimiter.' */ +- if (optarg[0] != '\0' && optarg[1] != '\0') +- FATAL_ERROR (_("the delimiter must be a single character")); +- delim = optarg[0]; +- delim_specified = true; ++ { ++#if HAVE_MBRTOWC ++ if(MB_CUR_MAX > 1) ++ { ++ mbstate_t state; ++ ++ memset (&state, '\0', sizeof(mbstate_t)); ++ delimlen = mbrtowc (&wcdelim, optarg, strnlen(optarg, MB_LEN_MAX), &state); ++ ++ if (delimlen == (size_t)-1 || delimlen == (size_t)-2) ++ ++force_singlebyte_mode; ++ else ++ { ++ delimlen = (delimlen < 1) ? 1 : delimlen; ++ if (wcdelim != L'\0' && *(optarg + delimlen) != '\0') ++ FATAL_ERROR (_("the delimiter must be a single character")); ++ memcpy (mbdelim, optarg, delimlen); ++ mbdelim[delimlen] = '\0'; ++ if (delimlen == 1) ++ delim = *optarg; ++ } ++ } ++ ++ if (MB_CUR_MAX <= 1 || force_singlebyte_mode) ++#endif ++ { ++ if (optarg[0] != '\0' && optarg[1] != '\0') ++ FATAL_ERROR (_("the delimiter must be a single character")); ++ delim = (unsigned char) optarg[0]; ++ } ++ delim_specified = true; ++ } + break; + + case OUTPUT_DELIMITER_OPTION: ++ output_delimiter_specified = true; + /* Interpret --output-delimiter='' to mean + 'use the NUL byte as the delimiter.' */ + output_delimiter_length = (optarg[0] == '\0' + ? 
1 : strlen (optarg)); +- output_delimiter_string = optarg; ++ output_delimiter_string = xstrdup (optarg); + break; + + case 'n': ++ byte_mode_character_aware = 1; + break; + + case 's': +@@ -533,40 +952,57 @@ main (int argc, char **argv) + } + } + +- if (!spec_list_string) ++ if (operating_mode == undefined_mode) + FATAL_ERROR (_("you must specify a list of bytes, characters, or fields")); + +- if (byte_mode) +- { +- if (delim_specified) +- FATAL_ERROR (_("an input delimiter may be specified only\ ++ if (delim_specified && operating_mode != field_mode) ++ FATAL_ERROR (_("an input delimiter may be specified only\ + when operating on fields")); + +- if (suppress_non_delimited) +- FATAL_ERROR (_("suppressing non-delimited lines makes sense\n\ ++ if (suppress_non_delimited && operating_mode != field_mode) ++ FATAL_ERROR (_("suppressing non-delimited lines makes sense\n\ + \tonly when operating on fields")); +- } + + set_fields (spec_list_string, +- ((byte_mode ? SETFLD_ERRMSG_USE_POS : 0) +- | (complement ? SETFLD_COMPLEMENT : 0))); ++ ( (operating_mode == field_mode) ? 0 : SETFLD_ERRMSG_USE_POS) ++ | (complement ? SETFLD_COMPLEMENT : 0) ); + + if (!delim_specified) +- delim = '\t'; ++ { ++ delim = '\t'; ++#ifdef HAVE_MBRTOWC ++ wcdelim = L'\t'; ++ mbdelim[0] = '\t'; ++ mbdelim[1] = '\0'; ++ delimlen = 1; ++#endif ++ } + + if (output_delimiter_string == NULL) + { +- output_delimiter_default[0] = delim; +- output_delimiter_string = output_delimiter_default; +- output_delimiter_length = 1; ++#ifdef HAVE_MBRTOWC ++ if (MB_CUR_MAX > 1 && !force_singlebyte_mode) ++ { ++ output_delimiter_string = xstrdup(mbdelim); ++ output_delimiter_length = delimlen; ++ } ++ ++ if (MB_CUR_MAX <= 1 || force_singlebyte_mode) ++#endif ++ { ++ static char dummy[2]; ++ dummy[0] = delim; ++ dummy[1] = '\0'; ++ output_delimiter_string = dummy; ++ output_delimiter_length = 1; ++ } + } + +- void (*cut_stream) (FILE *) = byte_mode ? 
cut_bytes : cut_fields; + if (optind == argc) +- ok = cut_file ("-", cut_stream); ++ ok = cut_file ("-"); + else + for (ok = true; optind < argc; optind++) +- ok &= cut_file (argv[optind], cut_stream); ++ ok &= cut_file (argv[optind]); + + + if (have_read_stdin && fclose (stdin) == EOF) +diff --git a/src/expand-common.c b/src/expand-common.c +index deec1bd..b39f740 100644 +--- a/src/expand-common.c ++++ b/src/expand-common.c +@@ -19,6 +19,7 @@ + #include + #include + #include ++#include + #include "system.h" + #include "die.h" + #include "error.h" +@@ -125,6 +126,119 @@ set_increment_size (uintmax_t tabval) + return ok; + } + ++extern int ++set_utf_locale (void) ++{ ++ /*try using some predefined locale */ ++ const char* predef_locales[] = {"C.UTF8","en_US.UTF8","en_GB.UTF8"}; ++ ++ const int predef_locales_count=3; ++ for (int i=0;ibufcount=0; ++ if (c == 0xEF) ++ { ++ c=fgetc(fp); ++ } ++ else ++ { ++ if (c != EOF) ++ { ++ ungetc(c,fp); ++ } ++ return false; ++ } ++ ++ if (c == 0xBB) ++ { ++ c=fgetc(fp); ++ } ++ else ++ { ++ if ( c!= EOF ) ++ { ++ mbf->buf[0]=(unsigned char) 0xEF; ++ mbf->bufcount=1; ++ ungetc(c,fp); ++ return false; ++ } ++ else ++ { ++ ungetc(0xEF,fp); ++ return false; ++ } ++ } ++ if (c == 0xBF) ++ { ++ mbf->bufcount=0; ++ return true; ++ } ++ else ++ { ++ if (c != EOF) ++ { ++ mbf->buf[0]=(unsigned char) 0xEF; ++ mbf->buf[1]=(unsigned char) 0xBB; ++ mbf->bufcount=2; ++ ungetc(c,fp); ++ return false; ++ } ++ else ++ { ++ mbf->buf[0]=(unsigned char) 0xEF; ++ mbf->bufcount=1; ++ ungetc(0xBB,fp); ++ return false; ++ } ++ } ++ return false; ++} ++ ++extern void ++print_bom(void) ++{ ++ putc (0xEF, stdout); ++ putc (0xBB, stdout); ++ putc (0xBF, stdout); ++} ++ + /* Add the comma or blank separated list of tab stops STOPS + to the list of tab stops. 
*/ + extern void +diff --git a/src/expand-common.h b/src/expand-common.h +index 5f59a0e..835b9d5 100644 +--- a/src/expand-common.h ++++ b/src/expand-common.h +@@ -25,6 +25,18 @@ extern size_t max_column_width; + /* The desired exit status. */ + extern int exit_status; + ++extern int ++set_utf_locale (void); ++ ++extern bool ++check_utf_locale(void); ++ ++extern bool ++check_bom(FILE* fp, mb_file_t *mbf); ++ ++extern void ++print_bom(void); ++ + /* Add tab stop TABVAL to the end of 'tab_list'. */ + extern void + add_tab_stop (uintmax_t tabval); +diff --git a/src/expand.c b/src/expand.c +index ed78ca8..a4cefa1 100644 +--- a/src/expand.c ++++ b/src/expand.c +@@ -37,6 +37,9 @@ + #include + #include + #include ++ ++#include ++ + #include "system.h" + #include "die.h" + +@@ -97,19 +100,41 @@ expand (void) + { + /* Input stream. */ + FILE *fp = next_file (NULL); ++ mb_file_t mbf; ++ mbf_char_t c; ++ /* True if the starting locale is utf8. */ ++ bool using_utf_locale; ++ ++ /* True if the first file contains BOM header. */ ++ bool found_bom; ++ using_utf_locale=check_utf_locale(); + + if (!fp) + return; ++ mbf_init (mbf, fp); ++ found_bom=check_bom(fp,&mbf); + +- while (true) ++ if (using_utf_locale == false && found_bom == true) ++ { ++ /*try using some predefined locale */ ++ ++ if (set_utf_locale () != 0) + { +- /* Input character, or EOF. */ +- int c; ++ error (EXIT_FAILURE, errno, _("cannot set UTF-8 locale")); ++ } ++ } ++ ++ ++ if (found_bom == true) ++ { ++ print_bom(); ++ } + ++ while (true) ++ { + /* If true, perform translations. */ + bool convert = true; + +- + /* The following variables have valid values only when CONVERT + is true: */ + +@@ -119,17 +144,48 @@ expand (void) + /* Index in TAB_LIST of next tab stop to examine. */ + size_t tab_index = 0; + +- + /* Convert a line of text. 
*/ + + do + { +- while ((c = getc (fp)) < 0 && (fp = next_file (fp))) +- continue; ++ while (true) { ++ mbf_getc (c, mbf); ++ if ((mb_iseof (c)) && (fp = next_file (fp))) ++ { ++ mbf_init (mbf, fp); ++ if (fp!=NULL) ++ { ++ if (check_bom(fp,&mbf)==true) ++ { ++ /*Not the first file - check BOM header*/ ++ if (using_utf_locale==false && found_bom==false) ++ { ++ /*BOM header in subsequent file but not in the first one. */ ++ error (EXIT_FAILURE, errno, _("combination of files with and without BOM header")); ++ } ++ } ++ else ++ { ++ if(using_utf_locale==false && found_bom==true) ++ { ++ /*First file contained BOM header - locale was switched to UTF ++ *all subsequent files should contain BOM. */ ++ error (EXIT_FAILURE, errno, _("combination of files with and without BOM header")); ++ } ++ } ++ } ++ continue; ++ } ++ else ++ { ++ break; ++ } ++ } ++ ++ + + if (convert) + { +- if (c == '\t') ++ if (mb_iseq (c, '\t')) + { + /* Column the next input tab stop is on. */ + uintmax_t next_tab_column; +@@ -148,32 +204,34 @@ expand (void) + if (putchar (' ') < 0) + die (EXIT_FAILURE, errno, _("write error")); + +- c = ' '; ++ mb_setascii (&c, ' '); + } +- else if (c == '\b') ++ else if (mb_iseq (c, '\b')) + { + /* Go back one column, and force recalculation of the + next tab stop. */ + column -= !!column; + tab_index -= !!tab_index; + } +- else ++ /* A leading control character could make us trip over. */ ++ else if (!mb_iscntrl (c)) + { +- column++; ++ column += mb_width (c); + if (!column) + die (EXIT_FAILURE, 0, _("input line is too long")); + } + +- convert &= convert_entire_line || !! isblank (c); ++ convert &= convert_entire_line || mb_isblank (c); + } + +- if (c < 0) ++ if (mb_iseof (c)) + return; + +- if (putchar (c) < 0) ++ mb_putc (c, stdout); ++ if (ferror (stdout)) + die (EXIT_FAILURE, errno, _("write error")); + } +- while (c != '\n'); ++ while (!mb_iseq (c, '\n')); + } + } - /* Initialize linebuffer LINEBUFFER for use.
*/ diff --git a/src/fold.c b/src/fold.c -index 8cd0d6b..d23edd5 100644 +index f07a90b..d32dbfd 100644 --- a/src/fold.c +++ b/src/fold.c @@ -22,12 +22,34 @@ @@ -203,12 +1514,15 @@ index 8cd0d6b..d23edd5 100644 /* Look for the last blank. */ while (logical_end) { -@@ -215,11 +252,221 @@ fold_file (char const *filename, size_t width) +@@ -215,13 +252,225 @@ fold_file (char const *filename, size_t width) line_out[offset_out++] = c; } - saved_errno = errno; + *saved_errno = errno; + if (!ferror (istream)) +- saved_errno = 0; ++ *saved_errno = 0; if (offset_out) fwrite (line_out, sizeof (char), (size_t) offset_out, stdout); @@ -384,6 +1698,8 @@ index 8cd0d6b..d23edd5 100644 + } + + *saved_errno = errno; ++ if (!ferror (istream)) ++ *saved_errno = 0; + + if (offset_out) + fwrite (line_out, sizeof (char), (size_t) offset_out, stdout); @@ -423,10 +1739,10 @@ index 8cd0d6b..d23edd5 100644 +#endif + fold_text (istream, width, &saved_errno); + - if (ferror (istream)) - { - error (0, saved_errno, "%s", quotef (filename)); -@@ -252,7 +499,8 @@ main (int argc, char **argv) + if (STREQ (filename, "-")) + clearerr (istream); + else if (fclose (istream) != 0 && !saved_errno) +@@ -252,7 +501,8 @@ main (int argc, char **argv) atexit (close_stdout); @@ -436,7 +1752,7 @@ index 8cd0d6b..d23edd5 100644 while ((optc = getopt_long (argc, argv, shortopts, longopts, NULL)) != -1) { -@@ -261,7 +509,15 @@ main (int argc, char **argv) +@@ -261,7 +511,15 @@ main (int argc, char **argv) switch (optc) { case 'b': /* Count bytes rather than columns. */ @@ -454,7 +1770,7 @@ index 8cd0d6b..d23edd5 100644 case 's': /* Break at word boundaries. */ diff --git a/src/join.c b/src/join.c -index 98b461c..9990f38 100644 +index f2fd172..6c7d1ed 100644 --- a/src/join.c +++ b/src/join.c @@ -22,19 +22,33 @@ @@ -509,7 +1825,7 @@ index 98b461c..9990f38 100644 /* If nonzero, check that the input is correctly ordered. 
*/ static enum -@@ -276,13 +292,14 @@ xfields (struct line *line) +@@ -280,13 +296,14 @@ xfields (struct line *line) if (ptr == lim) return; @@ -527,7 +1843,7 @@ index 98b461c..9990f38 100644 { /* Skip leading blanks before the first field. */ while (field_sep (*ptr)) -@@ -306,6 +323,147 @@ xfields (struct line *line) +@@ -310,6 +327,147 @@ xfields (struct line *line) extract_field (line, ptr, lim - ptr); } @@ -675,7 +1991,7 @@ index 98b461c..9990f38 100644 static void freeline (struct line *line) { -@@ -327,56 +485,133 @@ keycmp (struct line const *line1, struct line const *line2, +@@ -331,56 +489,133 @@ keycmp (struct line const *line1, struct line const *line2, size_t jf_1, size_t jf_2) { /* Start of field to compare in each file. */ @@ -832,7 +2148,7 @@ index 98b461c..9990f38 100644 } /* Check that successive input lines PREV and CURRENT from input file -@@ -468,6 +703,11 @@ get_line (FILE *fp, struct line **linep, int which) +@@ -472,6 +707,11 @@ get_line (FILE *fp, struct line **linep, int which) } ++line_no[which - 1]; @@ -844,7 +2160,7 @@ index 98b461c..9990f38 100644 xfields (line); if (prevline[which - 1]) -@@ -563,21 +803,28 @@ prfield (size_t n, struct line const *line) +@@ -567,21 +807,28 @@ prfield (size_t n, struct line const *line) /* Output all the fields in line, other than the join field. 
*/ @@ -876,7 +2192,7 @@ index 98b461c..9990f38 100644 prfield (i, line); } } -@@ -588,7 +835,6 @@ static void +@@ -592,7 +839,6 @@ static void prjoin (struct line const *line1, struct line const *line2) { const struct outlist *outlist; @@ -884,7 +2200,7 @@ index 98b461c..9990f38 100644 size_t field; struct line const *line; -@@ -622,7 +868,7 @@ prjoin (struct line const *line1, struct line const *line2) +@@ -626,7 +872,7 @@ prjoin (struct line const *line1, struct line const *line2) o = o->next; if (o == NULL) break; @@ -893,7 +2209,7 @@ index 98b461c..9990f38 100644 } putchar (eolchar); } -@@ -1099,20 +1345,43 @@ main (int argc, char **argv) +@@ -1102,20 +1348,43 @@ main (int argc, char **argv) case 't': { @@ -946,8 +2262,23 @@ index 98b461c..9990f38 100644 } break; +diff --git a/src/local.mk b/src/local.mk +index e1d15ce..1a5ffaa 100644 +--- a/src/local.mk ++++ b/src/local.mk +@@ -434,8 +434,8 @@ src_base32_CPPFLAGS = -DBASE_TYPE=32 $(AM_CPPFLAGS) + src_basenc_SOURCES = src/basenc.c + src_basenc_CPPFLAGS = -DBASE_TYPE=42 $(AM_CPPFLAGS) + +-src_expand_SOURCES = src/expand.c src/expand-common.c +-src_unexpand_SOURCES = src/unexpand.c src/expand-common.c ++src_expand_SOURCES = src/expand.c src/expand-common.c lib/mbfile.c ++src_unexpand_SOURCES = src/unexpand.c src/expand-common.c lib/mbfile.c + + src_wc_SOURCES = src/wc.c + if USE_AVX2_WC_LINECOUNT diff --git a/src/pr.c b/src/pr.c -index 26f221f..633f50e 100644 +index 4c17c00..b4fab1c 100644 --- a/src/pr.c +++ b/src/pr.c @@ -311,6 +311,24 @@ @@ -975,8 +2306,8 @@ index 26f221f..633f50e 100644 #include "system.h" #include "die.h" #include "error.h" -@@ -324,6 +342,18 @@ - #include "xstrtol.h" +@@ -325,6 +343,18 @@ + #include "xstrtol-error.h" #include "xdectoint.h" +/* Some systems, like BeOS, have multibyte encodings but lack mbstate_t. */ @@ -994,7 +2325,7 @@ index 26f221f..633f50e 100644 /* The official name of this program (e.g., no 'g' prefix). 
*/ #define PROGRAM_NAME "pr" -@@ -416,7 +446,20 @@ struct COLUMN +@@ -417,7 +447,20 @@ struct COLUMN typedef struct COLUMN COLUMN; @@ -1016,23 +2347,23 @@ index 26f221f..633f50e 100644 static bool read_line (COLUMN *p); static bool print_page (void); static bool print_stored (COLUMN *p); -@@ -428,6 +471,7 @@ static void add_line_number (COLUMN *p); - static void getoptnum (const char *n_str, int min, int *num, - const char *errfmt); +@@ -429,6 +472,7 @@ static void add_line_number (COLUMN *p); + static void getoptnum (char const *n_str, int min, int *num, + char const *errfmt); static void getoptarg (char *arg, char switch_char, char *character, + int *character_length, int *character_width, int *number); static void print_files (int number_of_files, char **av); static void init_parameters (int number_of_files); -@@ -441,7 +485,6 @@ static void store_char (char c); +@@ -442,7 +486,6 @@ static void store_char (char c); static void pad_down (unsigned int lines); static void read_rest_of_line (COLUMN *p); static void skip_read (COLUMN *p, int column_number); -static void print_char (char c); static void cleanup (void); static void print_sep_string (void); - static void separator_string (const char *optarg_S); -@@ -453,7 +496,7 @@ static COLUMN *column_vector; + static void separator_string (char const *optarg_S); +@@ -454,7 +497,7 @@ static COLUMN *column_vector; we store the leftmost columns contiguously in buff. To print a line from buff, get the index of the first character from line_vector[i], and print up to line_vector[i + 1]. */ @@ -1041,7 +2372,7 @@ index 26f221f..633f50e 100644 /* Index of the position in buff where the next character will be stored. */ -@@ -557,7 +600,7 @@ static int chars_per_column; +@@ -558,7 +601,7 @@ static int chars_per_column; static bool untabify_input = false; /* (-e) The input tab character. */ @@ -1050,7 +2381,7 @@ index 26f221f..633f50e 100644 /* (-e) Tabstops are at chars_per_tab, 2*chars_per_tab, 3*chars_per_tab, ... 
where the leftmost column is 1. */ -@@ -567,7 +610,10 @@ static int chars_per_input_tab = 8; +@@ -568,7 +611,10 @@ static int chars_per_input_tab = 8; static bool tabify_output = false; /* (-i) The output tab character. */ @@ -1062,7 +2393,7 @@ index 26f221f..633f50e 100644 /* (-i) The width of the output tab. */ static int chars_per_output_tab = 8; -@@ -637,7 +683,13 @@ static int line_number; +@@ -638,7 +684,13 @@ static int line_number; static bool numbered_lines = false; /* (-n) Character which follows each line number. */ @@ -1077,7 +2408,7 @@ index 26f221f..633f50e 100644 /* (-n) line counting starts with 1st line of input file (not with 1st line of 1st page printed). */ -@@ -690,6 +742,7 @@ static bool use_col_separator = false; +@@ -691,6 +743,7 @@ static bool use_col_separator = false; -a|COLUMN|-m is a 'space' and with the -J option a 'tab'. */ static char const *col_sep_string = ""; static int col_sep_length = 0; @@ -1085,7 +2416,7 @@ index 26f221f..633f50e 100644 static char *column_separator = (char *) " "; static char *line_separator = (char *) "\t"; -@@ -851,6 +904,13 @@ separator_string (const char *optarg_S) +@@ -853,6 +906,13 @@ separator_string (char const *optarg_S) integer_overflow (); col_sep_length = len; col_sep_string = optarg_S; @@ -1099,7 +2430,7 @@ index 26f221f..633f50e 100644 } int -@@ -875,6 +935,21 @@ main (int argc, char **argv) +@@ -877,6 +937,21 @@ main (int argc, char **argv) atexit (close_stdout); @@ -1121,7 +2452,7 @@ index 26f221f..633f50e 100644 n_files = 0; file_names = (argc > 1 ? xnmalloc (argc - 1, sizeof (char *)) -@@ -951,8 +1026,12 @@ main (int argc, char **argv) +@@ -953,8 +1028,12 @@ main (int argc, char **argv) break; case 'e': if (optarg) @@ -1136,7 +2467,7 @@ index 26f221f..633f50e 100644 /* Could check tab width > 0. 
*/ untabify_input = true; break; -@@ -965,8 +1044,12 @@ main (int argc, char **argv) +@@ -967,8 +1046,12 @@ main (int argc, char **argv) break; case 'i': if (optarg) @@ -1151,7 +2482,7 @@ index 26f221f..633f50e 100644 /* Could check tab width > 0. */ tabify_output = true; break; -@@ -984,8 +1067,8 @@ main (int argc, char **argv) +@@ -986,8 +1069,8 @@ main (int argc, char **argv) case 'n': numbered_lines = true; if (optarg) @@ -1162,7 +2493,7 @@ index 26f221f..633f50e 100644 break; case 'N': skip_count = false; -@@ -1010,6 +1093,7 @@ main (int argc, char **argv) +@@ -1012,6 +1095,7 @@ main (int argc, char **argv) /* Reset an additional input of -s, -S dominates -s */ col_sep_string = ""; col_sep_length = 0; @@ -1170,7 +2501,7 @@ index 26f221f..633f50e 100644 use_col_separator = true; if (optarg) separator_string (optarg); -@@ -1165,10 +1249,45 @@ getoptnum (const char *n_str, int min, int *num, const char *err) +@@ -1166,10 +1250,45 @@ getoptnum (char const *n_str, int min, int *num, char const *err) a number. */ static void @@ -1218,7 +2549,7 @@ index 26f221f..633f50e 100644 if (*arg) { long int tmp_long; -@@ -1190,6 +1309,11 @@ static void +@@ -1191,6 +1310,11 @@ static void init_parameters (int number_of_files) { int chars_used_by_number = 0; @@ -1230,7 +2561,7 @@ index 26f221f..633f50e 100644 lines_per_body = lines_per_page - lines_per_header - lines_per_footer; if (lines_per_body <= 0) -@@ -1227,7 +1351,7 @@ init_parameters (int number_of_files) +@@ -1228,7 +1352,7 @@ init_parameters (int number_of_files) else col_sep_string = column_separator; @@ -1239,7 +2570,7 @@ index 26f221f..633f50e 100644 use_col_separator = true; } /* It's rather pointless to define a TAB separator with column -@@ -1257,11 +1381,11 @@ init_parameters (int number_of_files) +@@ -1260,11 +1384,11 @@ init_parameters (int number_of_files) + TAB_WIDTH (chars_per_input_tab, chars_per_number); */ /* Estimate chars_per_text without any margin and keep it constant. 
*/ @@ -1253,7 +2584,7 @@ index 26f221f..633f50e 100644 /* The number is part of the column width unless we are printing files in parallel. */ -@@ -1270,7 +1394,7 @@ init_parameters (int number_of_files) +@@ -1273,7 +1397,7 @@ init_parameters (int number_of_files) } int sep_chars, useful_chars; @@ -1262,7 +2593,7 @@ index 26f221f..633f50e 100644 sep_chars = INT_MAX; if (INT_SUBTRACT_WRAPV (chars_per_line - chars_used_by_number, sep_chars, &useful_chars)) -@@ -1293,7 +1417,7 @@ init_parameters (int number_of_files) +@@ -1296,7 +1420,7 @@ init_parameters (int number_of_files) We've to use 8 as the lower limit, if we use chars_per_default_tab = 8 to expand a tab which is not an input_tab-char. */ free (clump_buff); @@ -1271,7 +2602,7 @@ index 26f221f..633f50e 100644 } /* Open the necessary files, -@@ -1399,7 +1523,7 @@ init_funcs (void) +@@ -1402,7 +1526,7 @@ init_funcs (void) /* Enlarge p->start_position of first column to use the same form of padding_not_printed with all columns. */ @@ -1280,7 +2611,7 @@ index 26f221f..633f50e 100644 /* This loop takes care of all but the rightmost column. */ -@@ -1433,7 +1557,7 @@ init_funcs (void) +@@ -1436,7 +1560,7 @@ init_funcs (void) } else { @@ -1289,7 +2620,7 @@ index 26f221f..633f50e 100644 h_next = h + chars_per_column; } } -@@ -1724,9 +1848,9 @@ static void +@@ -1733,9 +1857,9 @@ static void align_column (COLUMN *p) { padding_not_printed = p->start_position; @@ -1301,7 +2632,7 @@ index 26f221f..633f50e 100644 padding_not_printed = ANYWHERE; } -@@ -2001,13 +2125,13 @@ store_char (char c) +@@ -2010,13 +2134,13 @@ store_char (char c) /* May be too generous. 
*/ buff = X2REALLOC (buff, &buff_allocated); } @@ -1317,7 +2648,7 @@ index 26f221f..633f50e 100644 char *s; int num_width; -@@ -2024,22 +2148,24 @@ add_line_number (COLUMN *p) +@@ -2033,22 +2157,24 @@ add_line_number (COLUMN *p) /* Tabification is assumed for multiple columns, also for n-separators, but 'default n-separator = TAB' hasn't been given priority over equal column_width also specified by POSIX. */ @@ -1346,7 +2677,7 @@ index 26f221f..633f50e 100644 output_position = POS_AFTER_TAB (chars_per_output_tab, output_position); } -@@ -2198,7 +2324,7 @@ print_white_space (void) +@@ -2207,7 +2333,7 @@ print_white_space (void) while (goal - h_old > 1 && (h_new = POS_AFTER_TAB (chars_per_output_tab, h_old)) <= goal) { @@ -1355,7 +2686,7 @@ index 26f221f..633f50e 100644 h_old = h_new; } while (++h_old <= goal) -@@ -2218,6 +2344,7 @@ print_sep_string (void) +@@ -2227,6 +2353,7 @@ print_sep_string (void) { char const *s = col_sep_string; int l = col_sep_length; @@ -1363,7 +2694,7 @@ index 26f221f..633f50e 100644 if (separators_not_printed <= 0) { -@@ -2229,6 +2356,7 @@ print_sep_string (void) +@@ -2238,6 +2365,7 @@ print_sep_string (void) { for (; separators_not_printed > 0; --separators_not_printed) { @@ -1371,7 +2702,7 @@ index 26f221f..633f50e 100644 while (l-- > 0) { /* 3 types of sep_strings: spaces only, spaces and chars, -@@ -2242,12 +2370,15 @@ print_sep_string (void) +@@ -2251,12 +2379,15 @@ print_sep_string (void) } else { @@ -1388,7 +2719,7 @@ index 26f221f..633f50e 100644 /* sep_string ends with some spaces */ if (spaces_not_printed > 0) print_white_space (); -@@ -2275,7 +2406,7 @@ print_clump (COLUMN *p, int n, char *clump) +@@ -2284,7 +2415,7 @@ print_clump (COLUMN *p, int n, char *clump) required number of tabs and spaces. 
*/ static void @@ -1397,7 +2728,7 @@ index 26f221f..633f50e 100644 { if (tabify_output) { -@@ -2299,6 +2430,74 @@ print_char (char c) +@@ -2308,6 +2439,74 @@ print_char (char c) putchar (c); } @@ -1472,7 +2803,7 @@ index 26f221f..633f50e 100644 /* Skip to page PAGE before printing. PAGE may be larger than total number of pages. */ -@@ -2476,9 +2675,9 @@ read_line (COLUMN *p) +@@ -2485,9 +2684,9 @@ read_line (COLUMN *p) align_empty_cols = false; } @@ -1484,7 +2815,7 @@ index 26f221f..633f50e 100644 padding_not_printed = ANYWHERE; } -@@ -2547,7 +2746,7 @@ print_stored (COLUMN *p) +@@ -2556,7 +2755,7 @@ print_stored (COLUMN *p) COLUMN *q; int line = p->current_line++; @@ -1493,7 +2824,7 @@ index 26f221f..633f50e 100644 /* FIXME UMR: Uninitialized memory read: * This is occurring while in: -@@ -2559,7 +2758,7 @@ print_stored (COLUMN *p) +@@ -2568,7 +2767,7 @@ print_stored (COLUMN *p) xmalloc [xmalloc.c:94] init_store_cols [pr.c:1648] */ @@ -1502,7 +2833,7 @@ index 26f221f..633f50e 100644 pad_vertically = true; -@@ -2579,9 +2778,9 @@ print_stored (COLUMN *p) +@@ -2588,9 +2787,9 @@ print_stored (COLUMN *p) } } @@ -1514,7 +2845,7 @@ index 26f221f..633f50e 100644 padding_not_printed = ANYWHERE; } -@@ -2594,8 +2793,8 @@ print_stored (COLUMN *p) +@@ -2603,8 +2802,8 @@ print_stored (COLUMN *p) if (spaces_not_printed == 0) { output_position = p->start_position + end_vector[line]; @@ -1525,7 +2856,7 @@ index 26f221f..633f50e 100644 } return true; -@@ -2614,7 +2813,7 @@ print_stored (COLUMN *p) +@@ -2623,7 +2822,7 @@ print_stored (COLUMN *p) number of characters is 1.) 
*/ static int @@ -1534,7 +2865,7 @@ index 26f221f..633f50e 100644 { unsigned char uc = c; char *s = clump_buff; -@@ -2624,10 +2823,10 @@ char_to_clump (char c) +@@ -2633,10 +2832,10 @@ char_to_clump (char c) int chars; int chars_per_c = 8; @@ -1547,7 +2878,7 @@ index 26f221f..633f50e 100644 { width = TAB_WIDTH (chars_per_c, input_position); -@@ -2708,6 +2907,164 @@ char_to_clump (char c) +@@ -2717,6 +2916,164 @@ char_to_clump (char c) return chars; } @@ -1713,7 +3044,7 @@ index 26f221f..633f50e 100644 looking for more options and printing the next batch of files. diff --git a/src/sort.c b/src/sort.c -index 6d2eec5..f189a0d 100644 +index 3b775d6..a0ba243 100644 --- a/src/sort.c +++ b/src/sort.c @@ -29,6 +29,14 @@ @@ -1731,9 +3062,9 @@ index 6d2eec5..f189a0d 100644 #include "system.h" #include "argmatch.h" #include "die.h" -@@ -161,14 +169,39 @@ static int decimal_point; - /* Thousands separator; if -1, then there isn't one. */ - static int thousands_sep; +@@ -159,14 +167,39 @@ static int thousands_sep; + /* We currently ignore multi-byte grouping chars. */ + static bool thousands_sep_ignored; +/* True if -f is specified. */ +static bool folding; @@ -1772,9 +3103,9 @@ index 6d2eec5..f189a0d 100644 /* The kind of blanks for '-b' to skip in various options. */ enum blanktype { bl_start, bl_end, bl_both }; -@@ -342,13 +375,11 @@ static bool reverse; - they were read if all keys compare equal. */ - static bool stable; +@@ -343,13 +376,11 @@ static bool stable; + /* An int value outside char range. */ + enum { NON_CHAR = CHAR_MAX + 1 }; -/* If TAB has this value, blanks separate fields. */ -enum { TAB_DEFAULT = CHAR_MAX + 1 }; @@ -1789,7 +3120,7 @@ index 6d2eec5..f189a0d 100644 /* Flag to remove consecutive duplicate lines from the output. Only the last of a sequence of equal lines will be output. 
*/ -@@ -806,6 +837,46 @@ reap_all (void) +@@ -805,6 +836,46 @@ reap_all (void) reap (-1); } @@ -1836,7 +3167,7 @@ index 6d2eec5..f189a0d 100644 /* Clean up any remaining temporary files. */ static void -@@ -1274,7 +1345,7 @@ zaptemp (char const *name) +@@ -1272,7 +1343,7 @@ zaptemp (char const *name) free (node); } @@ -1845,7 +3176,7 @@ index 6d2eec5..f189a0d 100644 static int struct_month_cmp (void const *m1, void const *m2) -@@ -1289,7 +1360,7 @@ struct_month_cmp (void const *m1, void const *m2) +@@ -1287,7 +1358,7 @@ struct_month_cmp (void const *m1, void const *m2) /* Initialize the character class tables. */ static void @@ -1854,7 +3185,7 @@ index 6d2eec5..f189a0d 100644 { size_t i; -@@ -1301,7 +1372,7 @@ inittables (void) +@@ -1299,7 +1370,7 @@ inittables (void) fold_toupper[i] = toupper (i); } @@ -1863,7 +3194,7 @@ index 6d2eec5..f189a0d 100644 /* If we're not in the "C" locale, read different names for months. */ if (hard_LC_TIME) { -@@ -1383,6 +1454,84 @@ specify_nmerge (int oi, char c, char const *s) +@@ -1381,6 +1452,84 @@ specify_nmerge (int oi, char c, char const *s) xstrtol_fatal (e, oi, c, long_options, s); } @@ -1948,7 +3279,7 @@ index 6d2eec5..f189a0d 100644 /* Specify the amount of main memory to use when sorting. */ static void specify_sort_size (int oi, char c, char const *s) -@@ -1614,7 +1763,7 @@ buffer_linelim (struct buffer const *buf) +@@ -1612,7 +1761,7 @@ buffer_linelim (struct buffer const *buf) by KEY in LINE. */ static char * @@ -1957,7 +3288,7 @@ index 6d2eec5..f189a0d 100644 { char *ptr = line->text, *lim = ptr + line->length - 1; size_t sword = key->sword; -@@ -1623,10 +1772,10 @@ begfield (struct line const *line, struct keyfield const *key) +@@ -1621,10 +1770,10 @@ begfield (struct line const *line, struct keyfield const *key) /* The leading field separator itself is included in a field when -t is absent. 
*/ @@ -1970,7 +3301,7 @@ index 6d2eec5..f189a0d 100644 ++ptr; if (ptr < lim) ++ptr; -@@ -1652,11 +1801,70 @@ begfield (struct line const *line, struct keyfield const *key) +@@ -1650,12 +1799,71 @@ begfield (struct line const *line, struct keyfield const *key) return ptr; } @@ -2036,13 +3367,14 @@ index 6d2eec5..f189a0d 100644 /* Return the limit of (a pointer to the first character after) the field in LINE specified by KEY. */ + ATTRIBUTE_PURE static char * -limfield (struct line const *line, struct keyfield const *key) -+limfield_uni (const struct line *line, const struct keyfield *key) ++limfield_uni (struct line const *line, struct keyfield const *key) { char *ptr = line->text, *lim = ptr + line->length - 1; size_t eword = key->eword, echar = key->echar; -@@ -1671,10 +1879,10 @@ limfield (struct line const *line, struct keyfield const *key) +@@ -1670,10 +1878,10 @@ limfield (struct line const *line, struct keyfield const *key) 'beginning' is the first character following the delimiting TAB. Otherwise, leave PTR pointing at the first 'blank' character after the preceding field. */ @@ -2055,7 +3387,7 @@ index 6d2eec5..f189a0d 100644 ++ptr; if (ptr < lim && (eword || echar)) ++ptr; -@@ -1720,10 +1928,10 @@ limfield (struct line const *line, struct keyfield const *key) +@@ -1719,10 +1927,10 @@ limfield (struct line const *line, struct keyfield const *key) */ /* Make LIM point to the end of (one byte past) the current field. 
*/ @@ -2068,12 +3400,12 @@ index 6d2eec5..f189a0d 100644 if (newlim) lim = newlim; } -@@ -1754,6 +1962,130 @@ limfield (struct line const *line, struct keyfield const *key) +@@ -1753,6 +1961,130 @@ limfield (struct line const *line, struct keyfield const *key) return ptr; } +#if HAVE_MBRTOWC -+static char * ++static char * _GL_ATTRIBUTE_PURE +limfield_mb (const struct line *line, const struct keyfield *key) +{ + char *ptr = line->text, *lim = ptr + line->length - 1; @@ -2199,7 +3531,7 @@ index 6d2eec5..f189a0d 100644 /* Fill BUF reading from FP, moving buf->left bytes from the end of buf->buf to the beginning first. If EOF is reached and the file wasn't terminated by a newline, supply one. Set up BUF's line -@@ -1840,8 +2172,22 @@ fillbuf (struct buffer *buf, FILE *fp, char const *file) +@@ -1839,8 +2171,22 @@ fillbuf (struct buffer *buf, FILE *fp, char const *file) else { if (key->skipsblanks) @@ -2224,16 +3556,32 @@ index 6d2eec5..f189a0d 100644 line->keybeg = line_start; } } -@@ -1991,7 +2337,7 @@ human_numcompare (char const *a, char const *b) - hideously fast. */ +@@ -1976,12 +2322,10 @@ find_unit_order (char const *number) + ATTRIBUTE_PURE + static int +-human_numcompare (char const *a, char const *b) ++human_numcompare (char *a, char *b) + { +- while (blanks[to_uchar (*a)]) +- a++; +- while (blanks[to_uchar (*b)]) +- b++; ++ skipblanks(&a, a + strlen(a)); ++ skipblanks(&b, b + strlen(b)); + + int diff = find_unit_order (a) - find_unit_order (b); + return (diff ? 
diff : strnumcmp (a, b, decimal_point, thousands_sep)); +@@ -1993,7 +2337,7 @@ human_numcompare (char const *a, char const *b) + + ATTRIBUTE_PURE static int -numcompare (char const *a, char const *b) +numcompare_uni (const char *a, const char *b) { while (blanks[to_uchar (*a)]) a++; -@@ -2001,6 +2347,25 @@ numcompare (char const *a, char const *b) +@@ -2003,6 +2347,25 @@ numcompare (char const *a, char const *b) return strnumcmp (a, b, decimal_point, thousands_sep); } @@ -2259,7 +3607,7 @@ index 6d2eec5..f189a0d 100644 /* Work around a problem whereby the long double value returned by glibc's strtold ("NaN", ...) contains uninitialized bits: clear all bytes of A and B before calling strtold. FIXME: remove this function if -@@ -2051,7 +2416,7 @@ general_numcompare (char const *sa, char const *sb) +@@ -2053,7 +2416,7 @@ general_numcompare (char const *sa, char const *sb) Return 0 if the name in S is not recognized. */ static int @@ -2268,7 +3616,7 @@ index 6d2eec5..f189a0d 100644 { size_t lo = 0; size_t hi = MONTHS_PER_YEAR; -@@ -2327,15 +2692,14 @@ debug_key (struct line const *line, struct keyfield const *key) +@@ -2329,15 +2692,14 @@ debug_key (struct line const *line, struct keyfield const *key) char saved = *lim; *lim = '\0'; @@ -2286,7 +3634,7 @@ index 6d2eec5..f189a0d 100644 else if (key->general_numeric) ignore_value (strtold (beg, &tighter_lim)); else if (key->numeric || key->human_numeric) -@@ -2469,7 +2833,7 @@ key_warnings (struct keyfield const *gkey, bool gkey_only) +@@ -2483,7 +2845,7 @@ key_warnings (struct keyfield const *gkey, bool gkey_only) /* Warn about significant leading blanks. 
*/ bool implicit_skip = key_numeric (key) || key->month; bool line_offset = key->eword == 0 && key->echar != 0; /* -k1.x,1.y */ @@ -2295,7 +3643,66 @@ index 6d2eec5..f189a0d 100644 && ((!key->skipsblanks && !implicit_skip) || (!key->skipsblanks && key->schar) || (!key->skipeblanks && key->echar))) -@@ -2527,11 +2891,87 @@ key_warnings (struct keyfield const *gkey, bool gkey_only) +@@ -2531,9 +2893,9 @@ key_warnings (struct keyfield const *gkey, bool gkey_only) + bool number_locale_warned = false; + if (basic_numeric_field_span) + { +- if (tab == TAB_DEFAULT +- ? thousands_sep != NON_CHAR && (isblank (to_uchar (thousands_sep))) +- : tab == thousands_sep) ++ if (tab_length ++ ? tab[0] == thousands_sep ++ : thousands_sep != NON_CHAR && (isblank (to_uchar (thousands_sep)))) + { + error (0, 0, + _("field separator %s is treated as a " +@@ -2544,9 +2906,9 @@ key_warnings (struct keyfield const *gkey, bool gkey_only) + } + if (basic_numeric_field_span || general_numeric_field_span) + { +- if (tab == TAB_DEFAULT +- ? thousands_sep != NON_CHAR && (isblank (to_uchar (decimal_point))) +- : tab == decimal_point) ++ if (tab_length ++ ? 
tab[0] == decimal_point ++ : thousands_sep != NON_CHAR && (isblank (to_uchar (decimal_point)))) + { + error (0, 0, + _("field separator %s is treated as a " +@@ -2554,19 +2916,19 @@ key_warnings (struct keyfield const *gkey, bool gkey_only) + quote (((char []) {decimal_point, 0}))); + number_locale_warned = true; + } +- else if (tab == '-') ++ else if (tab_length && tab[0] == '-') + { + error (0, 0, + _("field separator %s is treated as a " + "minus sign in numbers"), +- quote (((char []) {tab, 0}))); ++ quote (((char []) {tab[0], 0}))); + } +- else if (general_numeric_field_span && tab == '+') ++ else if (general_numeric_field_span && tab_length && tab[0] == '+') + { + error (0, 0, + _("field separator %s is treated as a " + "plus sign in numbers"), +- quote (((char []) {tab, 0}))); ++ quote (((char []) {tab[0], 0}))); + } + } + +@@ -2577,7 +2939,7 @@ key_warnings (struct keyfield const *gkey, bool gkey_only) + { + error (0, 0, + _("%snumbers use %s as a decimal point in this locale"), +- tab == decimal_point ? "" : _("note "), ++ (tab_length && tab[0] == decimal_point) ? "" : _("note "), + quote (((char []) {decimal_point, 0}))); + + } +@@ -2610,11 +2972,87 @@ key_warnings (struct keyfield const *gkey, bool gkey_only) error (0, 0, _("option '-r' only applies to last-resort comparison")); } @@ -2384,7 +3791,7 @@ index 6d2eec5..f189a0d 100644 { struct keyfield *key = keylist; -@@ -2616,7 +3056,7 @@ keycompare (struct line const *a, struct line const *b) +@@ -2699,7 +3137,7 @@ keycompare (struct line const *a, struct line const *b) else if (key->human_numeric) diff = human_numcompare (ta, tb); else if (key->month) @@ -2393,7 +3800,7 @@ index 6d2eec5..f189a0d 100644 else if (key->random) diff = compare_random (ta, tlena, tb, tlenb); else if (key->version) -@@ -2732,6 +3172,211 @@ keycompare (struct line const *a, struct line const *b) +@@ -2815,6 +3253,211 @@ keycompare (struct line const *a, struct line const *b) return key->reverse ? 
-diff : diff; } @@ -2605,7 +4012,7 @@ index 6d2eec5..f189a0d 100644 /* Compare two lines A and B, returning negative, zero, or positive depending on whether A compares less than, equal to, or greater than B. */ -@@ -2759,7 +3404,7 @@ compare (struct line const *a, struct line const *b) +@@ -2842,7 +3485,7 @@ compare (struct line const *a, struct line const *b) diff = - NONZERO (blen); else if (blen == 0) diff = 1; @@ -2614,7 +4021,7 @@ index 6d2eec5..f189a0d 100644 { /* xmemcoll0 is a performance enhancement as it will not unconditionally write '\0' after the -@@ -4149,6 +4794,7 @@ set_ordering (char const *s, struct keyfield *key, enum blanktype blanktype) +@@ -4226,6 +4869,7 @@ set_ordering (char const *s, struct keyfield *key, enum blanktype blanktype) break; case 'f': key->translate = fold_toupper; @@ -2622,7 +4029,7 @@ index 6d2eec5..f189a0d 100644 break; case 'g': key->general_numeric = true; -@@ -4228,7 +4874,7 @@ main (int argc, char **argv) +@@ -4305,7 +4949,7 @@ main (int argc, char **argv) initialize_exit_failure (SORT_FAILURE); hard_LC_COLLATE = hard_locale (LC_COLLATE); @@ -2631,8 +4038,8 @@ index 6d2eec5..f189a0d 100644 hard_LC_TIME = hard_locale (LC_TIME); #endif -@@ -4249,6 +4895,29 @@ main (int argc, char **argv) - thousands_sep = -1; +@@ -4328,6 +4972,29 @@ main (int argc, char **argv) + thousands_sep = NON_CHAR; } +#if HAVE_MBRTOWC @@ -2661,7 +4068,7 @@ index 6d2eec5..f189a0d 100644 have_read_stdin = false; inittables (); -@@ -4523,13 +5192,34 @@ main (int argc, char **argv) +@@ -4602,13 +5269,34 @@ main (int argc, char **argv) case 't': { @@ -2700,7 +4107,7 @@ index 6d2eec5..f189a0d 100644 else { /* Provoke with 'sort -txx'. 
Complain about -@@ -4540,9 +5230,11 @@ main (int argc, char **argv) +@@ -4619,9 +5307,11 @@ main (int argc, char **argv) quote (optarg)); } } @@ -2714,21 +4121,214 @@ index 6d2eec5..f189a0d 100644 } break; -@@ -4771,12 +5463,10 @@ main (int argc, char **argv) - sort (files, nfiles, outfile, nthreads); - } +diff --git a/src/unexpand.c b/src/unexpand.c +index 7d6100f..04cd646 100644 +--- a/src/unexpand.c ++++ b/src/unexpand.c +@@ -38,6 +38,9 @@ + #include + #include + #include ++ ++#include ++ + #include "system.h" + #include "die.h" --#ifdef lint - if (files_from) - readtokens0_free (&tok); - else - free (files); --#endif +@@ -106,24 +109,47 @@ unexpand (void) + { + /* Input stream. */ + FILE *fp = next_file (NULL); ++ mb_file_t mbf; + + /* The array of pending blanks. In non-POSIX locales, blanks can + include characters other than spaces, so the blanks must be + stored, not merely counted. */ +- char *pending_blank; ++ mbf_char_t *pending_blank; ++ /* True if the starting locale is utf8. */ ++ bool using_utf_locale; ++ ++ /* True if the first file contains BOM header. */ ++ bool found_bom; ++ using_utf_locale=check_utf_locale(); + + if (!fp) + return; ++ mbf_init (mbf, fp); ++ found_bom=check_bom(fp,&mbf); ++ ++ if (using_utf_locale == false && found_bom == true) ++ { ++ /*try using some predefined locale */ + ++ if (set_utf_locale () != 0) ++ { ++ error (EXIT_FAILURE, errno, _("cannot set UTF-8 locale")); ++ } ++ } + /* The worst case is a non-blank character, then one blank, then a + tab stop, then MAX_COLUMN_WIDTH - 1 blanks, then a non-blank; so + allocate MAX_COLUMN_WIDTH bytes to store the blanks. */ +- pending_blank = xmalloc (max_column_width); ++ pending_blank = xmalloc (max_column_width * sizeof (mbf_char_t)); ++ ++ if (found_bom == true) ++ { ++ print_bom(); ++ } + + while (true) + { + /* Input character, or EOF. */ +- int c; ++ mbf_char_t c; + + /* If true, perform translations. 
*/ + bool convert = true; +@@ -157,12 +183,44 @@ unexpand (void) + + do + { +- while ((c = getc (fp)) < 0 && (fp = next_file (fp))) +- continue; ++ while (true) { ++ mbf_getc (c, mbf); ++ if ((mb_iseof (c)) && (fp = next_file (fp))) ++ { ++ mbf_init (mbf, fp); ++ if (fp!=NULL) ++ { ++ if (check_bom(fp,&mbf)==true) ++ { ++ /*Not the first file - check BOM header*/ ++ if (using_utf_locale==false && found_bom==false) ++ { ++ /*BOM header in subsequent file but not in the first one. */ ++ error (EXIT_FAILURE, errno, _("combination of files with and without BOM header")); ++ } ++ } ++ else ++ { ++ if(using_utf_locale==false && found_bom==true) ++ { ++ /*First file contained BOM header - locale was switched to UTF ++ *all subsequent files should contain BOM. */ ++ error (EXIT_FAILURE, errno, _("combination of files with and without BOM header")); ++ } ++ } ++ } ++ continue; ++ } ++ else ++ { ++ break; ++ } ++ } ++ + + if (convert) + { +- bool blank = !! isblank (c); ++ bool blank = mb_isblank (c); + + if (blank) + { +@@ -179,16 +237,16 @@ unexpand (void) + if (next_tab_column < column) + die (EXIT_FAILURE, 0, _("input line is too long")); + +- if (c == '\t') ++ if (mb_iseq (c, '\t')) + { + column = next_tab_column; + + if (pending) +- pending_blank[0] = '\t'; ++ mb_setascii (&pending_blank[0], '\t'); + } + else + { +- column++; ++ column += mb_width (c); + + if (! (prev_blank && column == next_tab_column)) + { +@@ -196,13 +254,14 @@ unexpand (void) + will be replaced by tabs. */ + if (column == next_tab_column) + one_blank_before_tab_stop = true; +- pending_blank[pending++] = c; ++ mb_copy (&pending_blank[pending++], &c); + prev_blank = true; + continue; + } + + /* Replace the pending blanks by a tab or two.
*/ +- pending_blank[0] = c = '\t'; ++ mb_setascii (&c, '\t'); ++ mb_setascii (&pending_blank[0], '\t'); + } + + /* Discard pending blanks, unless it was a single +@@ -210,7 +269,7 @@ unexpand (void) + pending = one_blank_before_tab_stop; + } + } +- else if (c == '\b') ++ else if (mb_iseq (c, '\b')) + { + /* Go back one column, and force recalculation of the + next tab stop. */ +@@ -218,9 +277,9 @@ unexpand (void) + next_tab_column = column; + tab_index -= !!tab_index; + } +- else ++ else if (!mb_iseq (c, '\n')) + { +- column++; ++ column += mb_width (c); + if (!column) + die (EXIT_FAILURE, 0, _("input line is too long")); + } +@@ -228,8 +287,11 @@ unexpand (void) + if (pending) + { + if (pending > 1 && one_blank_before_tab_stop) +- pending_blank[0] = '\t'; +- if (fwrite (pending_blank, 1, pending, stdout) != pending) ++ mb_setascii (&pending_blank[0], '\t'); ++ ++ for (int n = 0; n < pending; ++n) ++ mb_putc (pending_blank[n], stdout); ++ if (ferror (stdout)) + die (EXIT_FAILURE, errno, _("write error")); + pending = 0; + one_blank_before_tab_stop = false; +@@ -239,16 +301,17 @@ unexpand (void) + convert &= convert_entire_line || blank; + } + +- if (c < 0) ++ if (mb_iseof (c)) + { + free (pending_blank); + return; + } + +- if (putchar (c) < 0) ++ mb_putc (c, stdout); ++ if (ferror (stdout)) + die (EXIT_FAILURE, errno, _("write error")); + } +- while (c != '\n'); ++ while (!mb_iseq (c, '\n')); + } + } - if (have_read_stdin && fclose (stdin) == EOF) - sort_die (_("close failed"), "-"); diff --git a/src/uniq.c b/src/uniq.c -index 87a0c93..9f755d9 100644 +index e5996f0..871d47c 100644 --- a/src/uniq.c +++ b/src/uniq.c @@ -21,6 +21,17 @@ @@ -2749,12 +4349,8 @@ index 87a0c93..9f755d9 100644 #include "system.h" #include "argmatch.h" #include "linebuffer.h" -@@ -32,9 +43,21 @@ - #include "stdio--.h" - #include "xmemcoll.h" - #include "xstrtol.h" --#include "memcasecmp.h" -+#include "xmemcoll.h" +@@ -33,6 +44,18 @@ + #include "memcasecmp.h" #include "quote.h" +/* MB_LEN_MAX 
is incorrectly defined to be 1 in at least one GCC @@ -2772,7 +4368,7 @@ index 87a0c93..9f755d9 100644 /* The official name of this program (e.g., no 'g' prefix). */ #define PROGRAM_NAME "uniq" -@@ -144,6 +167,10 @@ enum +@@ -139,6 +162,10 @@ enum GROUP_OPTION = CHAR_MAX + 1 }; @@ -2783,16 +4379,16 @@ index 87a0c93..9f755d9 100644 static struct option const longopts[] = { {"count", no_argument, NULL, 'c'}, -@@ -260,7 +287,7 @@ size_opt (char const *opt, char const *msgid) - return a pointer to the beginning of the line's field to be compared. */ +@@ -254,7 +281,7 @@ size_opt (char const *opt, char const *msgid) - static char * _GL_ATTRIBUTE_PURE + ATTRIBUTE_PURE + static char * -find_field (struct linebuffer const *line) +find_field_uni (struct linebuffer *line) { size_t count; char const *lp = line->buffer; -@@ -280,6 +307,83 @@ find_field (struct linebuffer const *line) +@@ -274,6 +301,83 @@ find_field (struct linebuffer const *line) return line->buffer + i; } @@ -2876,228 +4472,7 @@ index 87a0c93..9f755d9 100644 /* Return false if two strings OLD and NEW match, true if not. OLD and NEW point not to the beginnings of the lines but rather to the beginnings of the fields to compare. -@@ -288,6 +392,8 @@ find_field (struct linebuffer const *line) - static bool - different (char *old, char *new, size_t oldlen, size_t newlen) - { -+ char *copy_old, *copy_new; -+ - if (check_chars < oldlen) - oldlen = check_chars; - if (check_chars < newlen) -@@ -295,15 +401,104 @@ different (char *old, char *new, size_t oldlen, size_t newlen) - - if (ignore_case) - { -- /* FIXME: This should invoke strcoll somehow. 
*/ -- return oldlen != newlen || memcasecmp (old, new, oldlen); -+ size_t i; -+ -+ copy_old = xmalloc (oldlen + 1); -+ copy_new = xmalloc (oldlen + 1); -+ -+ for (i = 0; i < oldlen; i++) -+ { -+ copy_old[i] = toupper (old[i]); -+ copy_new[i] = toupper (new[i]); -+ } -+ bool rc = xmemcoll (copy_old, oldlen, copy_new, newlen); -+ free (copy_old); -+ free (copy_new); -+ return rc; - } -- else if (hard_LC_COLLATE) -- return xmemcoll (old, oldlen, new, newlen) != 0; - else -- return oldlen != newlen || memcmp (old, new, oldlen); -+ { -+ copy_old = (char *)old; -+ copy_new = (char *)new; -+ } -+ -+ return xmemcoll (copy_old, oldlen, copy_new, newlen); -+ - } - -+#if HAVE_MBRTOWC -+static int -+different_multi (const char *old, const char *new, size_t oldlen, size_t newlen, mbstate_t oldstate, mbstate_t newstate) -+{ -+ size_t i, j, chars; -+ const char *str[2]; -+ char *copy[2]; -+ size_t len[2]; -+ mbstate_t state[2]; -+ size_t mblength; -+ wchar_t wc, uwc; -+ mbstate_t state_bak; -+ -+ str[0] = old; -+ str[1] = new; -+ len[0] = oldlen; -+ len[1] = newlen; -+ state[0] = oldstate; -+ state[1] = newstate; -+ -+ for (i = 0; i < 2; i++) -+ { -+ copy[i] = xmalloc (len[i] + 1); -+ memset (copy[i], '\0', len[i] + 1); -+ -+ for (j = 0, chars = 0; j < len[i] && chars < check_chars; chars++) -+ { -+ state_bak = state[i]; -+ mblength = mbrtowc (&wc, str[i] + j, len[i] - j, &(state[i])); -+ -+ switch (mblength) -+ { -+ case (size_t)-1: -+ case (size_t)-2: -+ state[i] = state_bak; -+ /* Fall through */ -+ case 0: -+ mblength = 1; -+ break; -+ -+ default: -+ if (ignore_case) -+ { -+ uwc = towupper (wc); -+ -+ if (uwc != wc) -+ { -+ mbstate_t state_wc; -+ size_t mblen; -+ -+ memset (&state_wc, '\0', sizeof(mbstate_t)); -+ mblen = wcrtomb (copy[i] + j, uwc, &state_wc); -+ assert (mblen != (size_t)-1); -+ } -+ else -+ memcpy (copy[i] + j, str[i] + j, mblength); -+ } -+ else -+ memcpy (copy[i] + j, str[i] + j, mblength); -+ } -+ j += mblength; -+ } -+ copy[i][j] = '\0'; -+ len[i] = j; -+ 
} -+ int rc = xmemcoll (copy[0], len[0], copy[1], len[1]); -+ free (copy[0]); -+ free (copy[1]); -+ return rc; -+ -+} -+#endif -+ - /* Output the line in linebuffer LINE to standard output - provided that the switches say it should be output. - MATCH is true if the line matches the previous line. -@@ -367,19 +562,38 @@ check_file (const char *infile, const char *outfile, char delimiter) - char *prevfield IF_LINT ( = NULL); - size_t prevlen IF_LINT ( = 0); - bool first_group_printed = false; -+#if HAVE_MBRTOWC -+ mbstate_t prevstate; -+ -+ memset (&prevstate, '\0', sizeof (mbstate_t)); -+#endif - - while (!feof (stdin)) - { - char *thisfield; - size_t thislen; - bool new_group; -+#if HAVE_MBRTOWC -+ mbstate_t thisstate; -+#endif - - if (readlinebuffer_delim (thisline, stdin, delimiter) == 0) - break; - - thisfield = find_field (thisline); - thislen = thisline->length - 1 - (thisfield - thisline->buffer); -+#if HAVE_MBRTOWC -+ if (MB_CUR_MAX > 1) -+ { -+ thisstate = thisline->state; - -+ new_group = (prevline->length == 0 -+ || different_multi (thisfield, prevfield, -+ thislen, prevlen, -+ thisstate, prevstate)); -+ } -+ else -+#endif - new_group = (prevline->length == 0 - || different (thisfield, prevfield, thislen, prevlen)); - -@@ -397,6 +611,10 @@ check_file (const char *infile, const char *outfile, char delimiter) - SWAP_LINES (prevline, thisline); - prevfield = thisfield; - prevlen = thislen; -+#if HAVE_MBRTOWC -+ if (MB_CUR_MAX > 1) -+ prevstate = thisstate; -+#endif - first_group_printed = true; - } - } -@@ -409,17 +627,26 @@ check_file (const char *infile, const char *outfile, char delimiter) - size_t prevlen; - uintmax_t match_count = 0; - bool first_delimiter = true; -+#if HAVE_MBRTOWC -+ mbstate_t prevstate; -+#endif - - if (readlinebuffer_delim (prevline, stdin, delimiter) == 0) - goto closefiles; - prevfield = find_field (prevline); - prevlen = prevline->length - 1 - (prevfield - prevline->buffer); -+#if HAVE_MBRTOWC -+ prevstate = prevline->state; 
-+#endif - - while (!feof (stdin)) - { - bool match; - char *thisfield; - size_t thislen; -+#if HAVE_MBRTOWC -+ mbstate_t thisstate = thisline->state; -+#endif - if (readlinebuffer_delim (thisline, stdin, delimiter) == 0) - { - if (ferror (stdin)) -@@ -428,6 +655,14 @@ check_file (const char *infile, const char *outfile, char delimiter) - } - thisfield = find_field (thisline); - thislen = thisline->length - 1 - (thisfield - thisline->buffer); -+#if HAVE_MBRTOWC -+ if (MB_CUR_MAX > 1) -+ { -+ match = !different_multi (thisfield, prevfield, -+ thislen, prevlen, thisstate, prevstate); -+ } -+ else -+#endif - match = !different (thisfield, prevfield, thislen, prevlen); - match_count += match; - -@@ -460,6 +695,9 @@ check_file (const char *infile, const char *outfile, char delimiter) - SWAP_LINES (prevline, thisline); - prevfield = thisfield; - prevlen = thislen; -+#if HAVE_MBRTOWC -+ prevstate = thisstate; -+#endif - if (!match) - match_count = 0; - } -@@ -506,6 +744,19 @@ main (int argc, char **argv) +@@ -494,6 +598,19 @@ main (int argc, char **argv) atexit (close_stdout); @@ -3117,6 +4492,209 @@ index 87a0c93..9f755d9 100644 skip_chars = 0; skip_fields = 0; check_chars = SIZE_MAX; +diff --git a/tests/Coreutils.pm b/tests/Coreutils.pm +index fad7ab9..c9021a6 100644 +--- a/tests/Coreutils.pm ++++ b/tests/Coreutils.pm +@@ -264,6 +264,9 @@ sub run_tests ($$$$$) + # Yes, this is an arbitrary limit. If it causes trouble, + # consider removing it. + my $max = 30; ++ # The downstream i18n multi-byte tests have a "-mb" suffix. ++ # Therefore add 3 to the maximum test name length. ++ $max += 3; + if ($max < length $test_name) + { + warn "$program_name: $test_name: test name is too long (> $max)\n"; +diff --git a/tests/expand/mb.sh b/tests/expand/mb.sh +new file mode 100755 +index 0000000..dd6007c +--- /dev/null ++++ b/tests/expand/mb.sh +@@ -0,0 +1,183 @@ ++#!/bin/sh ++ ++# Copyright (C) 2012-2015 Free Software Foundation, Inc. 
++ ++# This program is free software: you can redistribute it and/or modify ++# it under the terms of the GNU General Public License as published by ++# the Free Software Foundation, either version 3 of the License, or ++# (at your option) any later version. ++ ++# This program is distributed in the hope that it will be useful, ++# but WITHOUT ANY WARRANTY; without even the implied warranty of ++# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++# GNU General Public License for more details. ++ ++# You should have received a copy of the GNU General Public License ++# along with this program. If not, see <http://www.gnu.org/licenses/>. ++ ++. "${srcdir=.}/tests/init.sh"; path_prepend_ ./src ++print_ver_ expand ++ ++export LC_ALL=en_US.UTF-8 ++ ++#input containing multibyte characters ++cat <<\EOF > in || framework_failure_ ++1234567812345678123456781 ++. . . . ++a b c d ++. . . . ++ä ö ü ß ++. . . . ++EOF ++env printf ' äöü\t. öüä. \tä xx\n' >> in || framework_failure_ ++ ++cat <<\EOF > exp || framework_failure_ ++1234567812345678123456781 ++. . . . ++a b c d ++. . . . ++ä ö ü ß ++. . . . ++ äöü . öüä. ä xx ++EOF ++ ++expand < in > out || fail=1 ++compare exp out > /dev/null 2>&1 || fail=1 ++ ++#multiple files as an input ++cat <<\EOF >> exp || framework_failure_ ++1234567812345678123456781 ++. . . . ++a b c d ++. . . . ++ä ö ü ß ++. . . . ++ äöü . öüä.
ä xx ++EOF ++ ++expand ./in ./in > out || fail=1 ++compare exp out > /dev/null 2>&1 || fail=1 ++ ++#test characters with display widths != 1 ++env printf '12345678 ++e\t|ascii(1) ++\u00E9\t|composed(1) ++e\u0301\t|decomposed(1) ++\u3000\t|ideo-space(2) ++\uFF0D\t|full-hypen(2) ++' > in || framework_failure_ ++ ++env printf '12345678 ++e |ascii(1) ++\u00E9 |composed(1) ++e\u0301 |decomposed(1) ++\u3000 |ideo-space(2) ++\uFF0D |full-hypen(2) ++' > exp || framework_failure_ ++ ++expand < in > out || fail=1 ++compare exp out > /dev/null 2>&1 || fail=1 ++ ++#shouldn't fail with "input line too long" ++#when a line starts with a control character ++env printf '\n' > in || framework_failure_ ++ ++expand < in > out || fail=1 ++compare in out > /dev/null 2>&1 || fail=1 ++ ++#non-Unicode characters interspersed between Unicode ones ++env printf '12345678 ++\t\xFF| ++\xFF\t| ++\t\xFFä| ++ä\xFF\t| ++\tä\xFF| ++\xFF\tä| ++äbcdef\xFF\t| ++' > in || framework_failure_ ++ ++env printf '12345678 ++ \xFF| ++\xFF | ++ \xFFä| ++ä\xFF | ++ ä\xFF| ++\xFF ä| ++äbcdef\xFF | ++' > exp || framework_failure_ ++ ++expand < in > out || fail=1 ++compare exp out > /dev/null 2>&1 || fail=1 ++ ++ ++ ++#BOM header test 1 ++printf "\xEF\xBB\xBF" > in; cat <<\EOF >> in || framework_failure_ ++1234567812345678123456781 ++. . . . ++a b c d ++. . . . ++ä ö ü ß ++. . . . ++EOF ++env printf ' äöü\t. öüä. \tä xx\n' >> in || framework_failure_ ++ ++printf "\xEF\xBB\xBF" > exp; cat <<\EOF >> exp || framework_failure_ ++1234567812345678123456781 ++. . . . ++a b c d ++. . . . ++ä ö ü ß ++. . . . ++ äöü . öüä. ä xx ++EOF ++ ++ ++expand < in > out || fail=1 ++compare exp out > /dev/null 2>&1 || fail=1 ++ ++LANG=C expand < in > out || fail=1 ++compare exp out > /dev/null 2>&1 || fail=1 ++ ++LC_ALL=C expand < in > out || fail=1 ++compare exp out > /dev/null 2>&1 || fail=1 ++ ++ ++printf '\xEF\xBB\xBF' > in1; cat <<\EOF >> in1 || framework_failure_ ++1234567812345678123456781 ++. . . . ++a b c d ++. . . . 
++ä ö ü ß ++. . . . ++EOF ++env printf ' äöü\t. öüä. \tä xx\n' >> in1 || framework_failure_ ++ ++ ++printf '\xEF\xBB\xBF' > exp; cat <<\EOF >> exp || framework_failure_ ++1234567812345678123456781 ++. . . . ++a b c d ++. . . . ++ä ö ü ß ++. . . . ++ äöü . öüä. ä xx ++1234567812345678123456781 ++. . . . ++a b c d ++. . . . ++ä ö ü ß ++. . . . ++ äöü . öüä. ä xx ++EOF ++ ++expand in1 in1 > out || fail=1 ++compare exp out > /dev/null 2>&1 || fail=1 ++ ++LANG=C expand in1 in1 > out || fail=1 ++compare exp out > /dev/null 2>&1 || fail=1 ++ ++LC_ALL=C expand in1 in1 > out || fail=1 ++compare exp out > /dev/null 2>&1 || fail=1 ++ ++exit $fail diff --git a/tests/i18n/sort.sh b/tests/i18n/sort.sh new file mode 100755 index 0000000..26c95de @@ -3153,10 +4731,10 @@ index 0000000..26c95de + +Exit $fail diff --git a/tests/local.mk b/tests/local.mk -index 568944e..192f776 100644 +index 0f77786..dbe1843 100644 --- a/tests/local.mk +++ b/tests/local.mk -@@ -368,6 +368,8 @@ all_tests = \ +@@ -377,6 +377,8 @@ all_tests = \ tests/misc/sort-discrim.sh \ tests/misc/sort-files0-from.pl \ tests/misc/sort-float.sh \ @@ -3165,8 +4743,24 @@ index 568944e..192f776 100644 tests/misc/sort-h-thousands-sep.sh \ tests/misc/sort-merge.pl \ tests/misc/sort-merge-fdlimit.sh \ +@@ -576,6 +578,7 @@ all_tests = \ + tests/du/threshold.sh \ + tests/du/trailing-slash.sh \ + tests/du/two-args.sh \ ++ tests/expand/mb.sh \ + tests/id/gnu-zero-uids.sh \ + tests/id/no-context.sh \ + tests/id/context.sh \ +@@ -727,6 +730,7 @@ all_tests = \ + tests/touch/read-only.sh \ + tests/touch/relative.sh \ + tests/touch/trailing-slash.sh \ ++ tests/unexpand/mb.sh \ + $(all_root_tests) + + # See tests/factor/create-test.sh. 
diff --git a/tests/misc/expand.pl b/tests/misc/expand.pl -index 8a9cad1..9293e39 100755 +index 7a77e6f..27f6652 100755 --- a/tests/misc/expand.pl +++ b/tests/misc/expand.pl @@ -27,6 +27,15 @@ my $prog = 'expand'; @@ -3233,7 +4827,7 @@ index 8a9cad1..9293e39 100755 my $verbose = $ENV{VERBOSE}; diff --git a/tests/misc/fold.pl b/tests/misc/fold.pl -index 7b192b4..76f073f 100755 +index 2834f92..bc1616a 100755 --- a/tests/misc/fold.pl +++ b/tests/misc/fold.pl @@ -20,9 +20,18 @@ use strict; @@ -3306,7 +4900,7 @@ index 7b192b4..76f073f 100755 my $fail = run_tests ($program_name, $prog, \@Tests, $save_temps, $verbose); exit $fail; diff --git a/tests/misc/join.pl b/tests/misc/join.pl -index 4d399d8..07f2823 100755 +index 06ad777..be40204 100755 --- a/tests/misc/join.pl +++ b/tests/misc/join.pl @@ -25,6 +25,15 @@ my $limits = getlimits (); @@ -3427,7 +5021,7 @@ index 0000000..11836ba + +Exit $fail diff --git a/tests/misc/sort-merge.pl b/tests/misc/sort-merge.pl -index 23f6ed2..402a987 100755 +index 7eb4574..eda884c 100755 --- a/tests/misc/sort-merge.pl +++ b/tests/misc/sort-merge.pl @@ -26,6 +26,15 @@ my $prog = 'sort'; @@ -3487,7 +5081,7 @@ index 23f6ed2..402a987 100755 my $verbose = $ENV{VERBOSE}; diff --git a/tests/misc/sort.pl b/tests/misc/sort.pl -index c3e7f8e..6ecd3ff 100755 +index 0b0adca..fd27821 100755 --- a/tests/misc/sort.pl +++ b/tests/misc/sort.pl @@ -24,10 +24,15 @@ my $prog = 'sort'; @@ -3555,7 +5149,7 @@ index c3e7f8e..6ecd3ff 100755 my $save_temps = $ENV{DEBUG}; my $verbose = $ENV{VERBOSE}; diff --git a/tests/misc/unexpand.pl b/tests/misc/unexpand.pl -index 6ba6d40..de86723 100755 +index 2e1906f..fe66012 100755 --- a/tests/misc/unexpand.pl +++ b/tests/misc/unexpand.pl @@ -27,6 +27,14 @@ my $limits = getlimits (); @@ -3612,7 +5206,7 @@ index 6ba6d40..de86723 100755 my $verbose = $ENV{VERBOSE}; diff --git a/tests/misc/uniq.pl b/tests/misc/uniq.pl -index f028036..8eaf59a 100755 +index aa163cd..91d617d 100755 --- a/tests/misc/uniq.pl +++ b/tests/misc/uniq.pl @@ 
-23,9 +23,17 @@ my $limits = getlimits (); @@ -3688,7 +5282,7 @@ index f028036..8eaf59a 100755 @Tests = triple_test \@Tests; diff --git a/tests/pr/pr-tests.pl b/tests/pr/pr-tests.pl -index ec3980a..136657d 100755 +index 7ac6d4c..ae6cc35 100755 --- a/tests/pr/pr-tests.pl +++ b/tests/pr/pr-tests.pl @@ -24,6 +24,15 @@ use strict; @@ -3707,9 +5301,9 @@ index ec3980a..136657d 100755 my @tv = ( # -b option is no longer an official option. But it's still working to -@@ -474,8 +483,48 @@ push @Tests, - {IN=>{2=>"a\n"}}, - {OUT=>"a\t\t\t\t \t\t\ta\n"} ]; +@@ -512,8 +521,48 @@ push @Tests, + {IN=>"x\tx\tx\tx\tx\nx\tx\tx\tx\tx\n"}, + {OUT=>"x\tx\tx\tx\tx\tx\tx\tx\tx\tx\n"} ]; +# Add _POSIX2_VERSION=199209 to the environment of each test +# that uses an old-style option like +1. @@ -3756,6 +5350,184 @@ index ec3980a..136657d 100755 my $save_temps = $ENV{DEBUG}; my $verbose = $ENV{VERBOSE}; +diff --git a/tests/unexpand/mb.sh b/tests/unexpand/mb.sh +new file mode 100755 +index 0000000..8a82d74 +--- /dev/null ++++ b/tests/unexpand/mb.sh +@@ -0,0 +1,172 @@ ++#!/bin/sh ++ ++# Copyright (C) 2012-2015 Free Software Foundation, Inc. ++ ++# This program is free software: you can redistribute it and/or modify ++# it under the terms of the GNU General Public License as published by ++# the Free Software Foundation, either version 3 of the License, or ++# (at your option) any later version. ++ ++# This program is distributed in the hope that it will be useful, ++# but WITHOUT ANY WARRANTY; without even the implied warranty of ++# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++# GNU General Public License for more details. ++ ++# You should have received a copy of the GNU General Public License ++# along with this program. If not, see . ++ ++. "${srcdir=.}/tests/init.sh"; path_prepend_ ./src ++print_ver_ unexpand ++ ++export LC_ALL=en_US.UTF-8 ++ ++#input containing multibyte characters ++cat > in <<\EOF ++1234567812345678123456781 ++. . . . ++a b c d ++. . . . 
++ä ö ü ß ++. . . . ++ äöü . öüä. ä xx ++EOF ++ ++cat > exp <<\EOF ++1234567812345678123456781 ++. . . . ++a b c d ++. . . . ++ä ö ü ß ++. . . . ++ äöü . öüä. ä xx ++EOF ++ ++unexpand -a < in > out || fail=1 ++compare exp out > /dev/null 2>&1 || fail=1 ++ ++ ++#multiple files as an input ++cat >> exp <<\EOF ++1234567812345678123456781 ++. . . . ++a b c d ++. . . . ++ä ö ü ß ++. . . . ++ äöü . öüä. ä xx ++EOF ++ ++ ++unexpand -a ./in ./in > out || fail=1 ++compare exp out > /dev/null 2>&1 || fail=1 ++ ++#test characters with a display width larger than 1 ++ ++env printf '12345678 ++e |ascii(1) ++\u00E9 |composed(1) ++e\u0301 |decomposed(1) ++\u3000 |ideo-space(2) ++\uFF0D |full-hypen(2) ++' > in || framework_failure_ ++ ++env printf '12345678 ++e\t|ascii(1) ++\u00E9\t|composed(1) ++e\u0301\t|decomposed(1) ++\u3000\t|ideo-space(2) ++\uFF0D\t|full-hypen(2) ++' > exp || framework_failure_ ++ ++unexpand -a < in > out || fail=1 ++compare exp out > /dev/null 2>&1 || fail=1 ++ ++#test input where a blank of width > 1 is not being substituted ++in="$(LC_ALL=en_US.UTF-8 printf ' \u3000 ö ü ß')" ++exp='   ö ü ß' ++ ++unexpand -a < in > out || fail=1 ++compare exp out > /dev/null 2>&1 || fail=1 ++ ++#non-Unicode characters interspersed between Unicode ones ++env printf '12345678 ++ \xFF| ++\xFF | ++ \xFFä| ++ä\xFF | ++ ä\xFF| ++\xFF ä| ++äbcdef\xFF | ++' > in || framework_failure_ ++ ++env printf '12345678 ++\t\xFF| ++\xFF\t| ++\t\xFFä| ++ä\xFF\t| ++\tä\xFF| ++\xFF\tä| ++äbcdef\xFF\t| ++' > exp || framework_failure_ ++ ++unexpand -a < in > out || fail=1 ++compare exp out > /dev/null 2>&1 || fail=1 ++ ++#BOM header test 1 ++printf "\xEF\xBB\xBF" > in; cat <<\EOF >> in || framework_failure_ ++1234567812345678123456781 ++. . . . ++a b c d ++. . . . ++ä ö ü ß ++. . . . ++ äöü . öüä. ä xx ++EOF ++env printf ' äöü\t. öüä. \tä xx\n' >> in || framework_failure_ ++ ++printf "\xEF\xBB\xBF" > exp; cat <<\EOF >> exp || framework_failure_ ++1234567812345678123456781 ++. . . . ++a b c d ++. 
. . . ++ä ö ü ß ++. . . . ++ äöü . öüä. ä xx ++EOF ++ ++unexpand < in > out || fail=1 ++compare exp out > /dev/null 2>&1 || fail=1 ++ ++LANG=C unexpand < in > out || fail=1 ++compare exp out > /dev/null 2>&1 || fail=1 ++ ++LC_ALL=C unexpand < in > out || fail=1 ++compare exp out > /dev/null 2>&1 || fail=1 ++ ++ ++printf "\xEF\xBB\xBF" > exp; cat <<\EOF >> exp || framework_failure_ ++1234567812345678123456781 ++. . . . ++a b c d ++. . . . ++ä ö ü ß ++. . . . ++ äöü . öüä. ä xx ++1234567812345678123456781 ++. . . . ++a b c d ++. . . . ++ä ö ü ß ++. . . . ++ äöü . öüä. ä xx ++EOF ++ ++ ++unexpand in in > out || fail=1 ++compare exp out > /dev/null 2>&1 || fail=1 ++ ++LANG=C unexpand in in > out || fail=1 ++compare exp out > /dev/null 2>&1 || fail=1 ++ ++LC_ALL=C unexpand in in > out || fail=1 ++compare exp out > /dev/null 2>&1 || fail=1 -- -2.7.4 +2.34.1