[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
branch master updated: * tp/Texinfo/XS/parsetexi/def.c (split_def_args):
From: Patrice Dumas
Subject: branch master updated: * tp/Texinfo/XS/parsetexi/def.c (split_def_args): count UTF-8 encoded Unicode characters for source marks locations.
Date: Mon, 30 Jan 2023 17:27:15 -0500
This is an automated email from the git hooks/post-receive script.
pertusus pushed a commit to branch master
in repository texinfo.
The following commit(s) were added to refs/heads/master by this push:
new 282701e238 * tp/Texinfo/XS/parsetexi/def.c (split_def_args): count
UTF-8 encoded Unicode characters for source marks locations.
282701e238 is described below
commit 282701e238e01fce8b36d078f8abf488369e58e4
Author: Patrice Dumas <pertusus@free.fr>
AuthorDate: Mon Jan 30 23:27:05 2023 +0100
* tp/Texinfo/XS/parsetexi/def.c (split_def_args): count UTF-8 encoded
Unicode characters for source marks locations.
* tp/t/19def.t: do not skip end_of_lines_protected_non_ascii test.
* tp/Texinfo/XS/parsetexi/source_marks.c,
tp/Texinfo/XS/parsetexi/parser.c (count_convert_u8): move
count_convert_u8 to parser.c.
---
ChangeLog | 11 +++++++++++
tp/Texinfo/XS/parsetexi/def.c | 28 +++++++++++++++++++++++++---
tp/Texinfo/XS/parsetexi/parser.c | 17 +++++++++++++++++
tp/Texinfo/XS/parsetexi/parser.h | 1 +
tp/Texinfo/XS/parsetexi/source_marks.c | 13 -------------
tp/t/19def.t | 2 +-
6 files changed, 55 insertions(+), 17 deletions(-)
diff --git a/ChangeLog b/ChangeLog
index 554ccfaced..c3782056ad 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,14 @@
+2023-01-30 Patrice Dumas <pertusus@free.fr>
+
+ * tp/Texinfo/XS/parsetexi/def.c (split_def_args): count UTF-8 encoded
+ Unicode characters for source marks locations.
+
+ * tp/t/19def.t: do not skip end_of_lines_protected_non_ascii test.
+
+ * tp/Texinfo/XS/parsetexi/source_marks.c,
+ tp/Texinfo/XS/parsetexi/parser.c (count_convert_u8): move
+ count_convert_u8 to parser.c.
+
2023-01-30 Patrice Dumas <pertusus@free.fr>
* tp/Texinfo/XS/parsetexi/source_marks.c (count_convert_u8)
diff --git a/tp/Texinfo/XS/parsetexi/def.c b/tp/Texinfo/XS/parsetexi/def.c
index 52c1ffaac5..306d533fe8 100644
--- a/tp/Texinfo/XS/parsetexi/def.c
+++ b/tp/Texinfo/XS/parsetexi/def.c
@@ -15,6 +15,9 @@
#include <config.h>
#include <string.h>
+#include <stdbool.h>
+#include "uniconv.h"
+#include "unistr.h"
#include "parser.h"
#include "text.h"
@@ -201,20 +204,32 @@ split_def_args (ELEMENT *current, int starting_idx)
char *p;
ELEMENT *new;
int len;
- int current_position = 0;
- int previous_position = 0;
+ /* count UTF-8 encoded Unicode characters for source marks locations */
+ size_t current_position = 0;
+ size_t previous_position = 0;
+ uint8_t *u8_text = 0;
+ uint8_t *u8_p;
+
if (e->type == ET_bracketed)
{
isolate_last_space (e);
e->type = ET_bracketed_def_content;
continue;
}
+
if (e->text.end == 0)
continue;
+
p = e->text.text;
+ if (e->source_mark_list.number)
+ u8_text = u8_strconv_from_encoding (p, "UTF-8",
+ iconveh_question_mark);
+ u8_p = u8_text;
+
while (1)
{
+ size_t u8_len = 0;
len = strspn (p, whitespace_chars);
if (len)
{
@@ -226,7 +241,13 @@ split_def_args (ELEMENT *current, int starting_idx)
len = strcspn (p, whitespace_chars);
new = new_element (ET_NONE);
}
- current_position += len;
+ if (u8_text)
+ {
+ u8_len = u8_mbsnlen (u8_p, len);
+ u8_p += u8_len;
+ current_position += u8_len;
+ }
+
while (e->source_mark_list.number)
{
SOURCE_MARK *source_mark
@@ -251,6 +272,7 @@ split_def_args (ELEMENT *current, int starting_idx)
previous_position = current_position;
}
destroy_element (remove_from_contents (current, i--));
+ free (u8_text);
}
}
diff --git a/tp/Texinfo/XS/parsetexi/parser.c b/tp/Texinfo/XS/parsetexi/parser.c
index 0c369ffc9c..0a51799594 100644
--- a/tp/Texinfo/XS/parsetexi/parser.c
+++ b/tp/Texinfo/XS/parsetexi/parser.c
@@ -18,6 +18,9 @@
#include <stdlib.h>
#include <stdio.h>
#include <ctype.h>
+#include <stdbool.h>
+#include "uniconv.h"
+#include "unistr.h"
#include "parser.h"
#include "text.h"
@@ -31,6 +34,20 @@ const char *digit_chars = "0123456789";
// [^\S\r\n] in Perl
const char *whitespace_chars_except_newline = " \t\v\f";
+/* count characters, not bytes. */
+size_t
+count_convert_u8 (char *text)
+{
+ /* FIXME error checking? */
+ uint8_t *resultbuf = u8_strconv_from_encoding (text, "UTF-8",
+ iconveh_question_mark);
+ size_t result = u8_mbsnlen (resultbuf, u8_strlen (resultbuf));
+
+ free (resultbuf);
+
+ return result;
+}
+
/* Check if the contents of S2 appear at S1). */
int
looking_at (char *s1, char *s2)
diff --git a/tp/Texinfo/XS/parsetexi/parser.h b/tp/Texinfo/XS/parsetexi/parser.h
index fc0c991dd6..4bf185908d 100644
--- a/tp/Texinfo/XS/parsetexi/parser.h
+++ b/tp/Texinfo/XS/parsetexi/parser.h
@@ -152,6 +152,7 @@ ELEMENT *handle_separator (ELEMENT *current, char separator,
char **line_inout);
/* In parser.c */
+size_t count_convert_u8 (char *text);
ELEMENT *parse_texi (ELEMENT *root_elt, ELEMENT *current_elt);
void push_conditional_stack (enum command_id cond);
enum command_id pop_conditional_stack (void);
diff --git a/tp/Texinfo/XS/parsetexi/source_marks.c b/tp/Texinfo/XS/parsetexi/source_marks.c
index 8d23fd727f..51ef605816 100644
--- a/tp/Texinfo/XS/parsetexi/source_marks.c
+++ b/tp/Texinfo/XS/parsetexi/source_marks.c
@@ -14,14 +14,10 @@
along with this program. If not, see <http://www.gnu.org/licenses/>. */
#include <string.h>
-#include <stdbool.h>
-#include "uniconv.h"
-#include "unistr.h"
#include "source_marks.h"
#include "tree.h"
#include "errors.h"
-/* for debugging only */
#include "parser.h"
int include_counter = 0;
@@ -72,15 +68,6 @@ add_source_marks (SOURCE_MARK_LIST *source_mark_list, ELEMENT *e)
}
}
-/* count characters, not bytes. */
-size_t
-count_convert_u8 (char *text)
-{
- uint8_t *resultbuf = u8_strconv_from_encoding (text, "UTF-8",
- iconveh_question_mark);
- return u8_mbsnlen (resultbuf, u8_strlen (resultbuf));
-}
-
/* ELEMENT should be the parent container.
The source mark is put in the last content if it is text
or registered in the parent container. */
diff --git a/tp/t/19def.t b/tp/t/19def.t
index ef08c9695b..3a1b4c0876 100644
--- a/tp/t/19def.t
+++ b/tp/t/19def.t
@@ -124,7 +124,7 @@ deffn
@end deffn
'],
['end_of_lines_protected_non_ascii',
-undef, {'test_file' => 'end_of_lines_protected_non_ascii.texi',},# 'skip' => 'XS counts bytes not characters' },
+undef, {'test_file' => 'end_of_lines_protected_non_ascii.texi',},
],
['empty_def_command',
'@deffn empty deffn
[Prev in Thread] |
Current Thread |
[Next in Thread] |
- branch master updated: * tp/Texinfo/XS/parsetexi/def.c (split_def_args): count UTF-8 encoded Unicode characters for source marks locations.,
Patrice Dumas <=