From 5fab7a678572c146d59089e35a4682f74670ea2b Mon Sep 17 00:00:00 2001 From: Alexander Borisov Date: Thu, 5 Dec 2024 15:38:01 +0300 Subject: [PATCH] Add url data type to contrib. --- contrib/Makefile | 6 + contrib/meson.build | 1 + contrib/url/Makefile | 37 + contrib/url/README | 137 + contrib/url/expected/url.out | 478 +++ contrib/url/expected/url_1.out | 9 + contrib/url/lexbor/core/array.c | 210 ++ contrib/url/lexbor/core/array.h | 100 + contrib/url/lexbor/core/array_obj.c | 218 ++ contrib/url/lexbor/core/array_obj.h | 134 + contrib/url/lexbor/core/base.h | 103 + contrib/url/lexbor/core/bst.c | 471 +++ contrib/url/lexbor/core/bst.h | 108 + contrib/url/lexbor/core/conv.c | 346 ++ contrib/url/lexbor/core/conv.h | 61 + contrib/url/lexbor/core/def.h | 57 + contrib/url/lexbor/core/diyfp.c | 153 + contrib/url/lexbor/core/diyfp.h | 258 ++ contrib/url/lexbor/core/dobject.c | 187 + contrib/url/lexbor/core/dobject.h | 92 + contrib/url/lexbor/core/dtoa.c | 404 +++ contrib/url/lexbor/core/dtoa.h | 28 + contrib/url/lexbor/core/lexbor.h | 43 + contrib/url/lexbor/core/mem.c | 228 ++ contrib/url/lexbor/core/mem.h | 141 + contrib/url/lexbor/core/memory.c | 53 + contrib/url/lexbor/core/mraw.c | 429 +++ contrib/url/lexbor/core/mraw.h | 114 + contrib/url/lexbor/core/plog.c | 73 + contrib/url/lexbor/core/plog.h | 102 + contrib/url/lexbor/core/serialize.h | 24 + contrib/url/lexbor/core/shs.c | 118 + contrib/url/lexbor/core/shs.h | 82 + contrib/url/lexbor/core/str.c | 642 ++++ contrib/url/lexbor/core/str.h | 252 ++ contrib/url/lexbor/core/str_res.h | 420 +++ contrib/url/lexbor/core/strtod.c | 326 ++ contrib/url/lexbor/core/strtod.h | 28 + contrib/url/lexbor/core/swar.h | 97 + contrib/url/lexbor/core/types.h | 39 + contrib/url/lexbor/core/utils.h | 29 + contrib/url/lexbor/url/base.h | 32 + contrib/url/lexbor/url/url.c | 5049 +++++++++++++++++++++++++++ contrib/url/lexbor/url/url.h | 581 +++ contrib/url/meson.build | 52 + contrib/url/sql/url.sql | 126 + contrib/url/url--1.0.sql | 141 + 
contrib/url/url.c | 994 ++++++ contrib/url/url.control | 5 + 49 files changed, 13818 insertions(+) create mode 100644 contrib/url/Makefile create mode 100644 contrib/url/README create mode 100644 contrib/url/expected/url.out create mode 100644 contrib/url/expected/url_1.out create mode 100644 contrib/url/lexbor/core/array.c create mode 100644 contrib/url/lexbor/core/array.h create mode 100644 contrib/url/lexbor/core/array_obj.c create mode 100644 contrib/url/lexbor/core/array_obj.h create mode 100644 contrib/url/lexbor/core/base.h create mode 100644 contrib/url/lexbor/core/bst.c create mode 100644 contrib/url/lexbor/core/bst.h create mode 100644 contrib/url/lexbor/core/conv.c create mode 100644 contrib/url/lexbor/core/conv.h create mode 100644 contrib/url/lexbor/core/def.h create mode 100644 contrib/url/lexbor/core/diyfp.c create mode 100644 contrib/url/lexbor/core/diyfp.h create mode 100644 contrib/url/lexbor/core/dobject.c create mode 100644 contrib/url/lexbor/core/dobject.h create mode 100644 contrib/url/lexbor/core/dtoa.c create mode 100644 contrib/url/lexbor/core/dtoa.h create mode 100644 contrib/url/lexbor/core/lexbor.h create mode 100644 contrib/url/lexbor/core/mem.c create mode 100644 contrib/url/lexbor/core/mem.h create mode 100644 contrib/url/lexbor/core/memory.c create mode 100644 contrib/url/lexbor/core/mraw.c create mode 100644 contrib/url/lexbor/core/mraw.h create mode 100644 contrib/url/lexbor/core/plog.c create mode 100644 contrib/url/lexbor/core/plog.h create mode 100644 contrib/url/lexbor/core/serialize.h create mode 100644 contrib/url/lexbor/core/shs.c create mode 100644 contrib/url/lexbor/core/shs.h create mode 100644 contrib/url/lexbor/core/str.c create mode 100644 contrib/url/lexbor/core/str.h create mode 100644 contrib/url/lexbor/core/str_res.h create mode 100644 contrib/url/lexbor/core/strtod.c create mode 100644 contrib/url/lexbor/core/strtod.h create mode 100644 contrib/url/lexbor/core/swar.h create mode 100644 
contrib/url/lexbor/core/types.h create mode 100644 contrib/url/lexbor/core/utils.h create mode 100644 contrib/url/lexbor/url/base.h create mode 100644 contrib/url/lexbor/url/url.c create mode 100644 contrib/url/lexbor/url/url.h create mode 100644 contrib/url/meson.build create mode 100644 contrib/url/sql/url.sql create mode 100644 contrib/url/url--1.0.sql create mode 100644 contrib/url/url.c create mode 100644 contrib/url/url.control diff --git a/contrib/Makefile b/contrib/Makefile index 952855d9b6..2d81ef97f0 100644 --- a/contrib/Makefile +++ b/contrib/Makefile @@ -88,6 +88,12 @@ else ALWAYS_SUBDIRS += hstore_plpython jsonb_plpython ltree_plpython endif +ifeq ($(with_icu),yes) +SUBDIRS += url +else +ALWAYS_SUBDIRS += url +endif + # Missing: # start-scripts \ (does not have a makefile) diff --git a/contrib/meson.build b/contrib/meson.build index 159ff41555..ad4b21f49b 100644 --- a/contrib/meson.build +++ b/contrib/meson.build @@ -67,6 +67,7 @@ subdir('test_decoding') subdir('tsm_system_rows') subdir('tsm_system_time') subdir('unaccent') +subdir('url') subdir('uuid-ossp') subdir('vacuumlo') subdir('xml2') diff --git a/contrib/url/Makefile b/contrib/url/Makefile new file mode 100644 index 0000000000..8645042422 --- /dev/null +++ b/contrib/url/Makefile @@ -0,0 +1,37 @@ +# contrib/url/Makefile + +EXTENSION = url +MODULE_big = url +DATA = url--1.0.sql +REGRESS = url +PGFILEDESC = "url - Uniform Resource Locator data type" + +url_objs = lexbor/core/array.o \ + lexbor/core/array_obj.o \ + lexbor/core/bst.o \ + lexbor/core/conv.o \ + lexbor/core/diyfp.o \ + lexbor/core/dobject.o \ + lexbor/core/dtoa.o \ + lexbor/core/mem.o \ + lexbor/core/memory.o \ + lexbor/core/mraw.o \ + lexbor/core/plog.o \ + lexbor/core/str.o \ + lexbor/core/strtod.o \ + lexbor/url/url.o + +OBJS = url.o $(url_objs) + +SHLIB_LINK_INTERNAL = $(ICU_LIBS) + +ifdef USE_PGXS +PG_CONFIG = pg_config +PGXS := $(shell $(PG_CONFIG) --pgxs) +include $(PGXS) +else +subdir = contrib/url +top_builddir = ../.. 
+include $(top_builddir)/src/Makefile.global +include $(top_srcdir)/contrib/contrib-global.mk +endif diff --git a/contrib/url/README b/contrib/url/README new file mode 100644 index 0000000000..48b50b12a9 --- /dev/null +++ b/contrib/url/README @@ -0,0 +1,137 @@ +URL type for PostgreSQL +----------------------- + +To register the new SQL type: + + CREATE EXTENSION url; + +URL Functions + +Function: to_url() +Return Type: url +Description: Converts the given text value to a url. +Example: to_url('https://example.com/'::text) +Result: "https://example.com/" + +Function: url_base() +Return Type: url +Description: Resolves a relative url against an absolute base url and returns the resulting url. +Example: url_base('https://example.com/a/b/c/d/e/f'::url, '../../x/y/z'::text) +Result: "https://example.com/a/b/c/x/y/z" + +Getters + +Function: scheme() +Return Type: text +Description: Returns the scheme of the url if it exists, otherwise NULL. +Example: ('https://example.com/'::url).scheme +Result: "https" + +Function: username() +Return Type: text +Description: Returns the username of the url if it exists, otherwise NULL. +Example: ('https://root:qwerty@example.com/'::url).username +Result: "root" + +Function: password() +Return Type: text +Description: Returns the password of the url if it exists, otherwise NULL. +Example: ('https://root:qwerty@example.com/'::url).password +Result: "qwerty" + +Function: host() +Return Type: text +Description: Returns the host of the url if it exists, otherwise NULL. +Example: ('https://事例.com/'::url).host +Result: "xn--3kq3x.com" + +Function: host_unicode() +Return Type: text +Description: Returns the host of the url in Unicode form if it exists, otherwise NULL. +Example: ('https://事例.com/'::url).host_unicode +Result: "事例.com" + +Function: port() +Return Type: integer +Description: Returns the port of the url if it exists, otherwise NULL. +Example: ('https://example.com:8080/'::url).port +Result: 8080 + +Function: path() +Return Type: text +Description: Returns the path of the url. 
+Example: ('https://example.com/path/to/home'::url).path +Result: "/path/to/home" + +Function: query() +Return Type: text +Description: Returns the query of the url if it exists, otherwise NULL. +Example: ('https://example.com?abc=xyz&1=2'::url).query +Result: "abc=xyz&1=2" + +Function: fragment() +Return Type: text +Description: Returns the fragment of the url if it exists, otherwise NULL. +Example: ('https://example.com#comments'::url).fragment +Result: "comments" + + +Setters + +Function: url_scheme_set(url, text) +Return Type: url +Description: Sets a new scheme for the url. +Example: url_scheme_set('https://example.com'::url, 'wss') +Result: "wss://example.com/" + +Function: url_username_set(url, text) +Return Type: url +Description: Sets a new username for the url. If the new username is NULL or empty, the existing username is removed. +Example: url_username_set('https://root:qwerty@example.com'::url, 'guest') +Result: "https://guest:qwerty@example.com/" + +Function: url_password_set(url, text) +Return Type: url +Description: Sets a new password for the url. If the new password is NULL or empty, the existing password is removed. +Example: url_password_set('https://root:qwerty@example.com'::url, '12345') +Result: "https://root:12345@example.com/" + +Function: url_host_set(url, text) +Return Type: url +Description: Sets a new host for the url. +Example: url_host_set('https://example.com'::url, 'postgresql.org') +Result: "https://postgresql.org/" + +Function: url_hostname_set(url, text) +Return Type: url +Description: Sets a new hostname for the url. +Example: url_hostname_set('https://example.com'::url, 'postgresql.org') +Result: "https://postgresql.org/" + +Function: url_port_set(url, text or integer) +Return Type: url +Description: Sets a new port for the url. If the new port is NULL or empty, the existing port is removed. 
+Example: url_port_set('https://example.com:8080'::url, '80') +Result: "https://example.com:80/" + +Function: url_path_set(url, text) +Return Type: url +Description: Sets a new path for the url. If the new path is NULL or empty, the existing path is removed. +Example: url_path_set('https://example.com/path/to/home'::url, '/a/b/c') +Result: "https://example.com/a/b/c" + +Function: url_query_set(url, text) +Return Type: url +Description: Sets a new query for the url. If the new query is NULL or empty, the existing query is removed. +Example: url_query_set('https://example.com?abc=xyz'::url, '123=abc') +Result: "https://example.com/?123=abc" + +Function: url_fragment_set(url, text) +Return Type: url +Description: Sets a new fragment for the url. If the new fragment is NULL or empty, the existing fragment is removed. +Example: url_fragment_set('https://example.com#comment'::url, 'position') +Result: "https://example.com/#position" + +NOTE: +If NULL is passed as the url argument, NULL is returned. + diff --git a/contrib/url/expected/url.out b/contrib/url/expected/url.out new file mode 100644 index 0000000000..8ddcb7581d --- /dev/null +++ b/contrib/url/expected/url.out @@ -0,0 +1,478 @@ +-- +-- Basic URL tests for the behavior of functions. +-- The tests for compliance with the specification are located separately. +-- +-- The tests are designed for a UTF-8 database. Skip otherwise. 
+SELECT getdatabaseencoding() NOT IN ('UTF8') + AS skip_test \gset +\if :skip_test + \quit +\endif +SELECT getdatabaseencoding(); -- label the results files + getdatabaseencoding +--------------------- + UTF8 +(1 row) + +CREATE EXTENSION url; +-- Getters +select ('https://root:qwerty@example.com:8080/path/to/home?abc=xyz#anchor'::url).scheme; -- OK, https + scheme +-------- + https +(1 row) + +select ('https://root:qwerty@example.com:8080/path/to/home?abc=xyz#anchor'::url).username; -- OK, root + username +---------- + root +(1 row) + +select ('https://root:qwerty@example.com:8080/path/to/home?abc=xyz#anchor'::url).password; -- OK, qwerty + password +---------- + qwerty +(1 row) + +select ('https://root:qwerty@example.com:8080/path/to/home?abc=xyz#anchor'::url).host; -- OK, example.com + host +------------- + example.com +(1 row) + +select ('https://root:qwerty@εxαmπle.cθm:8080/path/to/home?abc=xyz#anchor'::url).host; -- OK, xn--xmle-0ldw4f.xn--cm-x9b + host +---------------------------- + xn--xmle-0ldw4f.xn--cm-x9b +(1 row) + +select ('https://root:qwerty@εxαmπle.cθm:8080/path/to/home?abc=xyz#anchor'::url).host_unicode; -- OK, εxαmπle.cθm + host_unicode +-------------- + εxαmπle.cθm +(1 row) + +select ('https://root:qwerty@example.com:8080/path/to/home?abc=xyz#anchor'::url).port; -- OK, 8080 + port +------ + 8080 +(1 row) + +select ('https://root:qwerty@example.com:8080/path/to/home?abc=xyz#anchor'::url).path; -- OK, /path/to/home + path +--------------- + /path/to/home +(1 row) + +select ('https://root:qwerty@example.com:8080/path/to/home?abc=xyz#anchor'::url).query; -- OK, abc=xyz + query +--------- + abc=xyz +(1 row) + +select ('https://root:qwerty@example.com:8080/path/to/home?abc=xyz#anchor'::url).fragment; -- OK, anchor + fragment +---------- + anchor +(1 row) + +-- Setters +select url_scheme_set('https://root:qwerty@example.com:8080/path/to/home?abc=xyz#anchor'::url, 'wss'); -- OK + url_scheme_set 
+---------------------------------------------------------------- + wss://root:qwerty@example.com:8080/path/to/home?abc=xyz#anchor +(1 row) + +select url_username_set('https://root:qwerty@example.com:8080/path/to/home?abc=xyz#anchor'::url, 'guest'); -- OK + url_username_set +------------------------------------------------------------------- + https://guest:qwerty@example.com:8080/path/to/home?abc=xyz#anchor +(1 row) + +select url_password_set('https://root:qwerty@example.com:8080/path/to/home?abc=xyz#anchor'::url, '12345'); -- OK + url_password_set +----------------------------------------------------------------- + https://root:12345@example.com:8080/path/to/home?abc=xyz#anchor +(1 row) + +select url_host_set('https://root:qwerty@example.com:8080/path/to/home?abc=xyz#anchor'::url, 'postgresql.org'); -- OK + url_host_set +--------------------------------------------------------------------- + https://root:qwerty@postgresql.org:8080/path/to/home?abc=xyz#anchor +(1 row) + +select url_port_set('https://root:qwerty@example.com:8080/path/to/home?abc=xyz#anchor'::url, '80'); -- OK + url_port_set +---------------------------------------------------------------- + https://root:qwerty@example.com:80/path/to/home?abc=xyz#anchor +(1 row) + +select url_path_set('https://root:qwerty@example.com:8080/path/to/home?abc=xyz#anchor'::url, '/docs/books/'); -- OK + url_path_set +----------------------------------------------------------------- + https://root:qwerty@example.com:8080/docs/books/?abc=xyz#anchor +(1 row) + +select url_query_set('https://root:qwerty@example.com:8080/path/to/home?abc=xyz#anchor'::url, 'xyz=abc'); -- OK + url_query_set +------------------------------------------------------------------ + https://root:qwerty@example.com:8080/path/to/home?xyz=abc#anchor +(1 row) + +select url_fragment_set('https://root:qwerty@example.com:8080/path/to/home?abc=xyz#anchor'::url, 'general_questions'); -- OK + url_fragment_set 
+----------------------------------------------------------------------------- + https://root:qwerty@example.com:8080/path/to/home?abc=xyz#general_questions +(1 row) + +-- Base +select url_base(NULL::url, NULL); -- OK, NULL + url_base +---------- + +(1 row) + +select url_base('https://root:qwerty@example.com:8080/path/to/home?abc=xyz#anchor'::url, NULL); -- OK + url_base +----------------------------------------------------------- + https://root:qwerty@example.com:8080/path/to/home?abc=xyz +(1 row) + +select url_base('https://root:qwerty@example.com:8080/path/to/home?abc=xyz#anchor'::url, '/change/path'); -- OK + url_base +-------------------------------------------------- + https://root:qwerty@example.com:8080/change/path +(1 row) + +-- Unicode +select ('https://εxαmπle.cθm/'::url).host; -- OK, xn--xmle-0ldw4f.xn--cm-x9b + host +---------------------------- + xn--xmle-0ldw4f.xn--cm-x9b +(1 row) + +select ('https://εxαmπle.cθm/'::url).host_unicode; -- OK, εxαmπle.cθm + host_unicode +-------------- + εxαmπle.cθm +(1 row) + +-- Percent Encode +select ('https://βεst@example.com'::url).username; -- OK + username +---------------- + %CE%B2%CE%B5st +(1 row) + +select ('https://:παssφord@example.com'::url).password; -- OK + password +------------------------- + %CF%80%CE%B1ss%CF%86ord +(1 row) + +select ('https://example.com/pαth/to/hθmε'::url).path; -- OK + path +------------------------------ + /p%CE%B1th/to/h%CE%B8m%CE%B5 +(1 row) + +select ('https://xample.com/?αβγ=χψω'::url).query; -- OK + query +--------------------------------------- + %CE%B1%CE%B2%CE%B3=%CF%87%CF%88%CF%89 +(1 row) + +select ('https://xample.com/#αnchθrΩ'::url).fragment; -- OK + fragment +------------------------ + %CE%B1nch%CE%B8r%CE%A9 +(1 row) + +-- Getters Ok, Error +select ''::url; -- ERROR +ERROR: failed to parse the URL "" +LINE 1: select ''::url; + ^ +select NULL::url; -- OK + url +----- + +(1 row) + +select (NULL::url).scheme; -- OK + scheme +-------- + +(1 row) + +select 
('file://path/to'::url).scheme; -- OK + scheme +-------- + file +(1 row) + +select ('/bad/url'::url).scheme; -- ERROR +ERROR: failed to parse the URL "/bad/url" +LINE 1: select ('/bad/url'::url).scheme; + ^ +select (NULL::url).username; -- OK + username +---------- + +(1 row) + +select ('https://example.com'::url).username; -- OK + username +---------- + +(1 row) + +select (NULL::url).password; -- OK + password +---------- + +(1 row) + +select ('https://example.com'::url).password; -- OK + password +---------- + +(1 row) + +select (NULL::url).host; -- OK + host +------ + +(1 row) + +select ('file://host/to'::url).host; -- OK + host +------ + host +(1 row) + +select ('file:/path/to'::url).host; -- OK + host +------ + +(1 row) + +select (NULL::url).host_unicode; -- OK + host_unicode +-------------- + +(1 row) + +select ('file://host/to'::url).host_unicode; -- OK + host_unicode +-------------- + host +(1 row) + +select ('file:/path/to'::url).host_unicode; -- OK + host_unicode +-------------- + +(1 row) + +select (NULL::url).port; -- OK + port +------ + +(1 row) + +select ('https://example.com'::url).port; -- OK + port +------ + +(1 row) + +select (NULL::url).path; -- OK + path +------ + +(1 row) + +select ('https://example.com'::url).path; -- OK + path +------ + / +(1 row) + +select ('file:/path/to'::url).path; -- OK + path +---------- + /path/to +(1 row) + +select (NULL::url).query; -- OK + query +------- + +(1 row) + +select ('https://example.com'::url).query; -- OK + query +------- + +(1 row) + +select (NULL::url).fragment; -- OK + fragment +---------- + +(1 row) + +select ('https://example.com'::url).fragment; -- OK + fragment +---------- + +(1 row) + +-- Setters Ok, Error +select url_scheme_set('https://example.com'::url, NULL); -- ERROR +ERROR: failed to parse "scheme" part "" of URL +select url_scheme_set('https://example.com'::url, ''); -- ERROR +ERROR: failed to parse "scheme" part "" of URL +select url_scheme_set('https://example.com'::url, '---+'); -- ERROR 
+ERROR: failed to parse "scheme" part "---+" of URL +select url_username_set('https://root:qwerty@example.com'::url, NULL); -- OK + url_username_set +------------------------------ + https://:qwerty@example.com/ +(1 row) + +select url_username_set('https://root:qwerty@example.com'::url, ''); -- OK + url_username_set +------------------------------ + https://:qwerty@example.com/ +(1 row) + +select url_username_set('https://root:qwerty@example.com'::url, 'αβγ'); -- OK + url_username_set +------------------------------------------------ + https://%CE%B1%CE%B2%CE%B3:qwerty@example.com/ +(1 row) + +select url_password_set('https://root:qwerty@example.com'::url, NULL); -- OK + url_password_set +--------------------------- + https://root@example.com/ +(1 row) + +select url_password_set('https://root:qwerty@example.com'::url, ''); -- OK + url_password_set +--------------------------- + https://root@example.com/ +(1 row) + +select url_password_set('https://root:qwerty@example.com'::url, 'αβγ'); -- OK + url_password_set +---------------------------------------------- + https://root:%CE%B1%CE%B2%CE%B3@example.com/ +(1 row) + +select url_port_set('https://example.com:8080'::url, NULL); -- OK + url_port_set +---------------------- + https://example.com/ +(1 row) + +select url_port_set('https://example.com:8080'::url, ''); -- OK + url_port_set +---------------------- + https://example.com/ +(1 row) + +select url_port_set('https://example.com:8080'::url, '80'); -- OK + url_port_set +------------------------- + https://example.com:80/ +(1 row) + +select url_port_set('https://example.com:8080'::url, '123456'); -- ERROR +ERROR: failed to parse "port" part "123456" of URL +select url_port_set('https://example.com:8080'::url, 80); -- OK + url_port_set +------------------------- + https://example.com:80/ +(1 row) + +select url_port_set('https://example.com:8080'::url, 123456); -- ERROR +ERROR: failed to parse "port" part "123456" of URL +select url_host_set('https://example.com'::url, 
NULL); -- ERROR +ERROR: failed to parse "host" part "" of URL +select url_host_set('https://example.com'::url, ''); -- ERROR +ERROR: failed to parse "host" part "" of URL +select url_host_set('https://example.com'::url, '123'); -- OK + url_host_set +-------------------- + https://0.0.0.123/ +(1 row) + +select url_host_set('https://example.com'::url, 'αβγ'); -- OK + url_host_set +-------------------- + https://xn--mxacd/ +(1 row) + +select url_path_set('https://example.com/path/to/home'::url, NULL); -- OK + url_path_set +---------------------- + https://example.com/ +(1 row) + +select url_path_set('https://example.com/path/to/home'::url, ''); -- OK + url_path_set +---------------------- + https://example.com/ +(1 row) + +select url_path_set('https://example.com/path/to/home'::url, '/'); -- OK + url_path_set +---------------------- + https://example.com/ +(1 row) + +select url_path_set('https://example.com/path/to/home'::url, 'αβγ'); -- OK + url_path_set +---------------------------------------- + https://example.com/%CE%B1%CE%B2%CE%B3 +(1 row) + +select url_query_set('https://example.com?abc=xyz'::url, NULL); -- OK + url_query_set +---------------------- + https://example.com/ +(1 row) + +select url_query_set('https://example.com?abc=xyz'::url, ''); -- OK + url_query_set +---------------------- + https://example.com/ +(1 row) + +select url_query_set('https://example.com?abc=xyz'::url, 'αβγ'); -- OK + url_query_set +----------------------------------------- + https://example.com/?%CE%B1%CE%B2%CE%B3 +(1 row) + +select url_fragment_set('https://example.com#anchor'::url, NULL); -- OK + url_fragment_set +---------------------- + https://example.com/ +(1 row) + +select url_fragment_set('https://example.com#anchor'::url, ''); -- OK + url_fragment_set +---------------------- + https://example.com/ +(1 row) + +select url_fragment_set('https://example.com#anchor'::url, 'αβγ'); -- OK + url_fragment_set +----------------------------------------- + 
https://example.com/#%CE%B1%CE%B2%CE%B3 +(1 row) + diff --git a/contrib/url/expected/url_1.out b/contrib/url/expected/url_1.out new file mode 100644 index 0000000000..59772488fd --- /dev/null +++ b/contrib/url/expected/url_1.out @@ -0,0 +1,9 @@ +-- +-- Basic URL tests for the behavior of functions. +-- The tests for compliance with the specification are located separately. +-- +-- The tests are designed for a UTF-8 database. Skip otherwise. +SELECT getdatabaseencoding() NOT IN ('UTF8') + AS skip_test \gset +\if :skip_test + \quit diff --git a/contrib/url/lexbor/core/array.c b/contrib/url/lexbor/core/array.c new file mode 100644 index 0000000000..2fe801fbe7 --- /dev/null +++ b/contrib/url/lexbor/core/array.c @@ -0,0 +1,210 @@ +/* + * Copyright (C) 2018 Alexander Borisov + * + * Author: Alexander Borisov + */ + +#include "lexbor/core/array.h" + + +lexbor_array_t * +lexbor_array_create(void) +{ + return lexbor_calloc(1, sizeof(lexbor_array_t)); +} + +lxb_status_t +lexbor_array_init(lexbor_array_t *array, size_t size) +{ + if (array == NULL) { + return LXB_STATUS_ERROR_OBJECT_IS_NULL; + } + + if (size == 0) { + return LXB_STATUS_ERROR_TOO_SMALL_SIZE; + } + + array->length = 0; + array->size = size; + + array->list = lexbor_malloc(sizeof(void *) * size); + if (array->list == NULL) { + return LXB_STATUS_ERROR_MEMORY_ALLOCATION; + } + + return LXB_STATUS_OK; +} + +void +lexbor_array_clean(lexbor_array_t *array) +{ + if (array != NULL) { + array->length = 0; + } +} + +lexbor_array_t * +lexbor_array_destroy(lexbor_array_t *array, bool self_destroy) +{ + if (array == NULL) + return NULL; + + if (array->list) { + array->length = 0; + array->size = 0; + array->list = lexbor_free(array->list); + } + + if (self_destroy) { + return lexbor_free(array); + } + + return array; +} + +void ** +lexbor_array_expand(lexbor_array_t *array, size_t up_to) +{ + void **list; + size_t new_size; + + if (array->length > (SIZE_MAX - up_to)) + return NULL; + + new_size = array->length + up_to; + 
list = lexbor_realloc(array->list, sizeof(void *) * new_size); + + if (list == NULL) + return NULL; + + array->list = list; + array->size = new_size; + + return list; +} + +lxb_status_t +lexbor_array_push(lexbor_array_t *array, void *value) +{ + if (array->length >= array->size) { + if ((lexbor_array_expand(array, 128) == NULL)) { + return LXB_STATUS_ERROR_MEMORY_ALLOCATION; + } + } + + array->list[ array->length ] = value; + array->length++; + + return LXB_STATUS_OK; +} + +void * +lexbor_array_pop(lexbor_array_t *array) +{ + if (array->length == 0) { + return NULL; + } + + array->length--; + return array->list[ array->length ]; +} + +lxb_status_t +lexbor_array_insert(lexbor_array_t *array, size_t idx, void *value) +{ + if (idx >= array->length) { + size_t up_to = (idx - array->length) + 1; + + if (idx >= array->size) { + if ((lexbor_array_expand(array, up_to) == NULL)) { + return LXB_STATUS_ERROR_MEMORY_ALLOCATION; + } + } + + memset(&array->list[array->length], 0, sizeof(void *) * up_to); + + array->list[ idx ] = value; + array->length += up_to; + + return LXB_STATUS_OK; + } + + if (array->length >= array->size) { + if ((lexbor_array_expand(array, 32) == NULL)) { + return LXB_STATUS_ERROR_MEMORY_ALLOCATION; + } + } + + memmove(&array->list[idx + 1], &array->list[idx], + sizeof(void *) * (array->length - idx)); + + array->list[ idx ] = value; + array->length++; + + return LXB_STATUS_OK; +} + +lxb_status_t +lexbor_array_set(lexbor_array_t *array, size_t idx, void *value) +{ + if (idx >= array->length) { + size_t up_to = (idx - array->length) + 1; + + if (idx >= array->size) { + if ((lexbor_array_expand(array, up_to) == NULL)) { + return LXB_STATUS_ERROR_MEMORY_ALLOCATION; + } + } + + memset(&array->list[array->length], 0, sizeof(void *) * up_to); + + array->length += up_to; + } + + array->list[idx] = value; + + return LXB_STATUS_OK; +} + +void +lexbor_array_delete(lexbor_array_t *array, size_t begin, size_t length) +{ + size_t end_len; + + if (begin >= 
array->length || length == 0) { + return; + } + + end_len = begin + length; + + if (end_len >= array->length) { + array->length = begin; + return; + } + + memmove(&array->list[begin], &array->list[end_len], + sizeof(void *) * (array->length - end_len)); + + array->length -= length; +} + +/* + * No inline functions. + */ +void * +lexbor_array_get_noi(lexbor_array_t *array, size_t idx) +{ + return lexbor_array_get(array, idx); +} + +size_t +lexbor_array_length_noi(lexbor_array_t *array) +{ + return lexbor_array_length(array); +} + +size_t +lexbor_array_size_noi(lexbor_array_t *array) +{ + return lexbor_array_size(array); +} diff --git a/contrib/url/lexbor/core/array.h b/contrib/url/lexbor/core/array.h new file mode 100644 index 0000000000..5dbdac7d1e --- /dev/null +++ b/contrib/url/lexbor/core/array.h @@ -0,0 +1,100 @@ +/* + * Copyright (C) 2018 Alexander Borisov + * + * Author: Alexander Borisov + */ + +#ifndef LEXBOR_ARRAY_H +#define LEXBOR_ARRAY_H + +#ifdef __cplusplus +extern "C" { +#endif + +#include "lexbor/core/base.h" + + +typedef struct { + void **list; + size_t size; + size_t length; +} +lexbor_array_t; + + +LXB_API lexbor_array_t * +lexbor_array_create(void); + +LXB_API lxb_status_t +lexbor_array_init(lexbor_array_t *array, size_t size); + +LXB_API void +lexbor_array_clean(lexbor_array_t *array); + +LXB_API lexbor_array_t * +lexbor_array_destroy(lexbor_array_t *array, bool self_destroy); + + +LXB_API void ** +lexbor_array_expand(lexbor_array_t *array, size_t up_to); + + +LXB_API lxb_status_t +lexbor_array_push(lexbor_array_t *array, void *value); + +LXB_API void * +lexbor_array_pop(lexbor_array_t *array); + +LXB_API lxb_status_t +lexbor_array_insert(lexbor_array_t *array, size_t idx, void *value); + +LXB_API lxb_status_t +lexbor_array_set(lexbor_array_t *array, size_t idx, void *value); + +LXB_API void +lexbor_array_delete(lexbor_array_t *array, size_t begin, size_t length); + + +/* + * Inline functions + */ +lxb_inline void * 
+lexbor_array_get(lexbor_array_t *array, size_t idx) +{ + if (idx >= array->length) { + return NULL; + } + + return array->list[idx]; +} + +lxb_inline size_t +lexbor_array_length(lexbor_array_t *array) +{ + return array->length; +} + +lxb_inline size_t +lexbor_array_size(lexbor_array_t *array) +{ + return array->size; +} + +/* + * No inline functions for ABI. + */ +LXB_API void * +lexbor_array_get_noi(lexbor_array_t *array, size_t idx); + +LXB_API size_t +lexbor_array_length_noi(lexbor_array_t *array); + +LXB_API size_t +lexbor_array_size_noi(lexbor_array_t *array); + + +#ifdef __cplusplus +} /* extern "C" */ +#endif + +#endif /* LEXBOR_ARRAY_H */ diff --git a/contrib/url/lexbor/core/array_obj.c b/contrib/url/lexbor/core/array_obj.c new file mode 100644 index 0000000000..eb1fd452c3 --- /dev/null +++ b/contrib/url/lexbor/core/array_obj.c @@ -0,0 +1,218 @@ +/* + * Copyright (C) 2018 Alexander Borisov + * + * Author: Alexander Borisov + */ + +#include "lexbor/core/array_obj.h" + + +lexbor_array_obj_t * +lexbor_array_obj_create(void) +{ + return lexbor_calloc(1, sizeof(lexbor_array_obj_t)); +} + +lxb_status_t +lexbor_array_obj_init(lexbor_array_obj_t *array, + size_t size, size_t struct_size) +{ + if (array == NULL) { + return LXB_STATUS_ERROR_OBJECT_IS_NULL; + } + + if (size == 0 || struct_size == 0) { + return LXB_STATUS_ERROR_TOO_SMALL_SIZE; + } + + array->length = 0; + array->size = size; + array->struct_size = struct_size; + + array->list = lexbor_malloc(sizeof(uint8_t) + * (array->size * struct_size)); /* list is a byte buffer: size elements of struct_size bytes each */ + if (array->list == NULL) { + return LXB_STATUS_ERROR_MEMORY_ALLOCATION; + } + + return LXB_STATUS_OK; +} + +void +lexbor_array_obj_clean(lexbor_array_obj_t *array) +{ + if (array != NULL) { + array->length = 0; + } +} + +lexbor_array_obj_t * +lexbor_array_obj_destroy(lexbor_array_obj_t *array, bool self_destroy) +{ + if (array == NULL) + return NULL; + + if (array->list) { + array->length = 0; + array->size = 0; + array->list = lexbor_free(array->list); + } + + 
if (self_destroy) { + return lexbor_free(array); + } + + return array; +} + +uint8_t * +lexbor_array_obj_expand(lexbor_array_obj_t *array, size_t up_to) +{ + uint8_t *list; + size_t new_size; + + if (array->length > (SIZE_MAX - up_to)) { + return NULL; + } + + new_size = array->length + up_to; + + list = lexbor_realloc(array->list, sizeof(uint8_t) + * (new_size * array->struct_size)); /* byte count, same sizing rule as lexbor_array_obj_init() */ + if (list == NULL) { + return NULL; + } + + array->list = list; + array->size = new_size; + + return list; +} + +void * +lexbor_array_obj_push(lexbor_array_obj_t *array) +{ + void *entry; + + if (array->length >= array->size) + { + if ((lexbor_array_obj_expand(array, 128) == NULL)) { + return NULL; + } + } + + entry = array->list + (array->length * array->struct_size); + array->length++; + + memset(entry, 0, array->struct_size); + + return entry; +} + +void * +lexbor_array_obj_push_wo_cls(lexbor_array_obj_t *array) +{ + void *entry; + + if (array->length >= array->size) { + if ((lexbor_array_obj_expand(array, 128) == NULL)) { + return NULL; + } + } + + entry = array->list + (array->length * array->struct_size); + array->length++; + + return entry; +} + +void * +lexbor_array_obj_push_n(lexbor_array_obj_t *array, size_t count) +{ + void *entry; + + if ((array->length + count) > array->size) { + if ((lexbor_array_obj_expand(array, count + 128) == NULL)) { + return NULL; + } + } + + entry = array->list + (array->length * array->struct_size); + array->length += count; + + return entry; +} + +void * +lexbor_array_obj_pop(lexbor_array_obj_t *array) +{ + if (array->length == 0) { + return NULL; + } + + array->length--; + return array->list + (array->length * array->struct_size); +} + +void +lexbor_array_obj_delete(lexbor_array_obj_t *array, size_t begin, size_t length) +{ + size_t end_len; + + if (begin >= array->length || length == 0) { + return; + } + + end_len = begin + length; + + if (end_len >= array->length) { + array->length = begin; + return; + } + + memmove(&array->list[ begin * 
array->struct_size ], + &array->list[ end_len * array->struct_size ], + sizeof(uint8_t) + * ((array->length - end_len) * array->struct_size)); /* move only the bytes of the elements after the deleted range */ + + array->length -= length; +} + +/* + * No inline functions. + */ +void +lexbor_array_obj_erase_noi(lexbor_array_obj_t *array) +{ + lexbor_array_obj_erase(array); +} + +void * +lexbor_array_obj_get_noi(lexbor_array_obj_t *array, size_t idx) +{ + return lexbor_array_obj_get(array, idx); +} + +size_t +lexbor_array_obj_length_noi(lexbor_array_obj_t *array) +{ + return lexbor_array_obj_length(array); +} + +size_t +lexbor_array_obj_size_noi(lexbor_array_obj_t *array) +{ + return lexbor_array_obj_size(array); +} + +size_t +lexbor_array_obj_struct_size_noi(lexbor_array_obj_t *array) +{ + return lexbor_array_obj_struct_size(array); +} + +void * +lexbor_array_obj_last_noi(lexbor_array_obj_t *array) +{ + return lexbor_array_obj_last(array); +} diff --git a/contrib/url/lexbor/core/array_obj.h b/contrib/url/lexbor/core/array_obj.h new file mode 100644 index 0000000000..de6202b221 --- /dev/null +++ b/contrib/url/lexbor/core/array_obj.h @@ -0,0 +1,134 @@ +/* + * Copyright (C) 2018 Alexander Borisov + * + * Author: Alexander Borisov + */ + +#ifndef LEXBOR_ARRAY_OBJ_H +#define LEXBOR_ARRAY_OBJ_H + +#ifdef __cplusplus +extern "C" { +#endif + +#include "lexbor/core/base.h" + + +typedef struct { + uint8_t *list; + size_t size; + size_t length; + size_t struct_size; +} +lexbor_array_obj_t; + + +LXB_API lexbor_array_obj_t * +lexbor_array_obj_create(void); + +LXB_API lxb_status_t +lexbor_array_obj_init(lexbor_array_obj_t *array, + size_t size, size_t struct_size); + +LXB_API void +lexbor_array_obj_clean(lexbor_array_obj_t *array); + +LXB_API lexbor_array_obj_t * +lexbor_array_obj_destroy(lexbor_array_obj_t *array, bool self_destroy); + + +LXB_API uint8_t * +lexbor_array_obj_expand(lexbor_array_obj_t *array, size_t up_to); + + +LXB_API void * +lexbor_array_obj_push(lexbor_array_obj_t *array); + +LXB_API void * 
+lexbor_array_obj_push_wo_cls(lexbor_array_obj_t *array); + +LXB_API void * +lexbor_array_obj_push_n(lexbor_array_obj_t *array, size_t count); + +LXB_API void * +lexbor_array_obj_pop(lexbor_array_obj_t *array); + +LXB_API void +lexbor_array_obj_delete(lexbor_array_obj_t *array, size_t begin, size_t length); + + +/* + * Inline functions + */ +lxb_inline void +lexbor_array_obj_erase(lexbor_array_obj_t *array) +{ + memset(array, 0, sizeof(lexbor_array_obj_t)); +} + +lxb_inline void * +lexbor_array_obj_get(const lexbor_array_obj_t *array, size_t idx) +{ + if (idx >= array->length) { + return NULL; + } + + return array->list + (idx * array->struct_size); +} + +lxb_inline size_t +lexbor_array_obj_length(lexbor_array_obj_t *array) +{ + return array->length; +} + +lxb_inline size_t +lexbor_array_obj_size(lexbor_array_obj_t *array) +{ + return array->size; +} + +lxb_inline size_t +lexbor_array_obj_struct_size(lexbor_array_obj_t *array) +{ + return array->struct_size; +} + +lxb_inline void * +lexbor_array_obj_last(lexbor_array_obj_t *array) +{ + if (array->length == 0) { + return NULL; + } + + return array->list + ((array->length - 1) * array->struct_size); +} + + +/* + * No inline functions for ABI. 
+ */
+LXB_API void
+lexbor_array_obj_erase_noi(lexbor_array_obj_t *array);
+
+LXB_API void *
+lexbor_array_obj_get_noi(lexbor_array_obj_t *array, size_t idx);
+
+LXB_API size_t
+lexbor_array_obj_length_noi(lexbor_array_obj_t *array);
+
+LXB_API size_t
+lexbor_array_obj_size_noi(lexbor_array_obj_t *array);
+
+LXB_API size_t
+lexbor_array_obj_struct_size_noi(lexbor_array_obj_t *array);
+
+LXB_API void *
+lexbor_array_obj_last_noi(lexbor_array_obj_t *array);
+
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+#endif /* LEXBOR_ARRAY_OBJ_H */
diff --git a/contrib/url/lexbor/core/base.h b/contrib/url/lexbor/core/base.h
new file mode 100644
index 0000000000..3c14dcff4c
--- /dev/null
+++ b/contrib/url/lexbor/core/base.h
@@ -0,0 +1,103 @@
+/*
+ * Copyright (C) 2018-2024 Alexander Borisov
+ *
+ * Author: Alexander Borisov
+ */
+
+#ifndef LEXBOR_BASE_H
+#define LEXBOR_BASE_H
+
+#ifdef __cplusplus
+#define __STDC_LIMIT_MACROS
+#define __STDC_CONSTANT_MACROS
+
+extern "C" {
+#endif
+
+/*
+ * NOTE(review): the names of these seven system headers were lost when the
+ * patch was extracted (everything between angle brackets was stripped).
+ * The list below covers what the code in this patch uses (malloc family,
+ * size_t, intptr_t, SIZE_MAX, bool, memset/memmove/memcpy, LONG_MAX);
+ * confirm names and order against the original patch.
+ */
+#include <stdlib.h>
+#include <stdio.h>
+#include <stdint.h>
+#include <stddef.h>
+#include <stdbool.h>
+#include <string.h>
+#include <limits.h>
+
+#include "lexbor/core/def.h"
+#include "lexbor/core/types.h"
+#include "lexbor/core/lexbor.h"
+
+#define LEXBOR_VERSION_MAJOR 1
+#define LEXBOR_VERSION_MINOR 8
+#define LEXBOR_VERSION_PATCH 0
+
+#define LEXBOR_VERSION_STRING LEXBOR_STRINGIZE(LEXBOR_VERSION_MAJOR) "." \
+                              LEXBOR_STRINGIZE(LEXBOR_VERSION_MINOR) "." \
+                              LEXBOR_STRINGIZE(LEXBOR_VERSION_PATCH)
+
+/* Assertions are compiled out: the macro expands to nothing. */
+#define lexbor_assert(val)
+
+/* Both macros evaluate their arguments twice; avoid side effects. */
+#define lexbor_max(val1, val2) ((val1) > (val2) ? (val1) : (val2))
+#define lexbor_min(val1, val2) ((val1) < (val2) ? (val1) : (val2))
+
+
+/*
+ * Very important!!!
+ *
+ * for lexbor 0..00AFFF; LXB_STATUS_OK == 0x000000
+ */
+typedef enum {
+    LXB_STATUS_OK = 0x0000,
+    LXB_STATUS_ERROR = 0x0001,
+    LXB_STATUS_ERROR_MEMORY_ALLOCATION,
+    LXB_STATUS_ERROR_OBJECT_IS_NULL,
+    LXB_STATUS_ERROR_SMALL_BUFFER,
+    LXB_STATUS_ERROR_INCOMPLETE_OBJECT,
+    LXB_STATUS_ERROR_NO_FREE_SLOT,
+    LXB_STATUS_ERROR_TOO_SMALL_SIZE,
+    LXB_STATUS_ERROR_NOT_EXISTS,
+    LXB_STATUS_ERROR_WRONG_ARGS,
+    LXB_STATUS_ERROR_WRONG_STAGE,
+    LXB_STATUS_ERROR_UNEXPECTED_RESULT,
+    LXB_STATUS_ERROR_UNEXPECTED_DATA,
+    LXB_STATUS_ERROR_OVERFLOW,
+    LXB_STATUS_CONTINUE,
+    LXB_STATUS_SMALL_BUFFER,
+    LXB_STATUS_ABORTED,
+    LXB_STATUS_STOPPED,
+    LXB_STATUS_NEXT,
+    LXB_STATUS_STOP,
+    LXB_STATUS_WARNING
+}
+lexbor_status_t;
+
+typedef enum {
+    LEXBOR_ACTION_OK = 0x00,
+    LEXBOR_ACTION_STOP = 0x01,
+    LEXBOR_ACTION_NEXT = 0x02
+}
+lexbor_action_t;
+
+
+typedef lxb_status_t
+(*lexbor_serialize_cb_f)(const lxb_char_t *data, size_t len, void *ctx);
+
+typedef lxb_status_t
+(*lexbor_serialize_cb_cp_f)(const lxb_codepoint_t *cps, size_t len, void *ctx);
+
+
+typedef struct {
+    lexbor_serialize_cb_f cb;
+    void *ctx;
+
+    intptr_t opt;
+    size_t count;
+}
+lexbor_serialize_ctx_t;
+
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+#endif /* LEXBOR_BASE_H */
+
diff --git a/contrib/url/lexbor/core/bst.c b/contrib/url/lexbor/core/bst.c
new file mode 100644
index 0000000000..60e79cfb6e
--- /dev/null
+++ b/contrib/url/lexbor/core/bst.c
@@ -0,0 +1,471 @@
+/*
+ * Copyright (C) 2018 Alexander Borisov
+ *
+ * Author: Alexander Borisov
+ */
+
+#include "lexbor/core/bst.h"
+#include "lexbor/core/conv.h"
+
+
+/*
+ * Allocate a zero-filled tree descriptor; lexbor_bst_init() must follow.
+ */
+lexbor_bst_t *
+lexbor_bst_create(void)
+{
+    return lexbor_calloc(1, sizeof(lexbor_bst_t));
+}
+
+/*
+ * Create the node pool (`size` is passed through to lexbor_dobject_init())
+ * and reset the tree to empty.
+ *
+ * Returns LXB_STATUS_ERROR_OBJECT_IS_NULL for a NULL tree,
+ * LXB_STATUS_ERROR_WRONG_ARGS for size == 0, or the status reported by
+ * lexbor_dobject_init().
+ */
+lxb_status_t
+lexbor_bst_init(lexbor_bst_t *bst, size_t size)
+{
+    lxb_status_t status;
+
+    if (bst == NULL) {
+        return LXB_STATUS_ERROR_OBJECT_IS_NULL;
+    }
+
+    if (size == 0) {
+        return LXB_STATUS_ERROR_WRONG_ARGS;
+    }
+
+    bst->dobject = lexbor_dobject_create();
+    status = lexbor_dobject_init(bst->dobject, size,
+                                 sizeof(lexbor_bst_entry_t));
+    if (status != LXB_STATUS_OK) {
+        return status;
+    }
+
+    bst->root = 0;
+    bst->tree_length = 0;
+
+    return LXB_STATUS_OK;
+}
+
+/*
+ * Drop every node but keep the pool for reuse.  NULL is accepted.
+ */
+void
+lexbor_bst_clean(lexbor_bst_t *bst)
+{
+    if (bst != NULL) {
+        lexbor_dobject_clean(bst->dobject);
+
+        bst->root = 0;
+        bst->tree_length = 0;
+    }
+}
+
+/*
+ * Destroy the node pool and, when `self_destroy` is true, the descriptor.
+ * Returns the result of lexbor_free() in that case, otherwise `bst`.
+ */
+lexbor_bst_t *
+lexbor_bst_destroy(lexbor_bst_t *bst, bool self_destroy)
+{
+    if (bst == NULL) {
+        return NULL;
+    }
+
+    bst->dobject = lexbor_dobject_destroy(bst->dobject, true);
+
+    if (self_destroy) {
+        return lexbor_free(bst);
+    }
+
+    return bst;
+}
+
+/*
+ * Take a zeroed node from the pool, set its key to `size` and count it in
+ * tree_length.  The node is not linked anywhere.  Returns NULL when the
+ * pool cannot provide a node.
+ */
+lexbor_bst_entry_t *
+lexbor_bst_entry_make(lexbor_bst_t *bst, size_t size)
+{
+    lexbor_bst_entry_t *new_entry = lexbor_dobject_calloc(bst->dobject);
+    if (new_entry == NULL) {
+        return NULL;
+    }
+
+    new_entry->size = size;
+
+    bst->tree_length++;
+
+    return new_entry;
+}
+
+/*
+ * Insert `value` under key `size` into the tree rooted at *scope.
+ * A key that already exists gets the new node chained through `next`
+ * right behind the existing one.  Returns the new node or NULL when no
+ * node could be allocated.
+ */
+lexbor_bst_entry_t *
+lexbor_bst_insert(lexbor_bst_t *bst, lexbor_bst_entry_t **scope,
+                  size_t size, void *value)
+{
+    lexbor_bst_entry_t *new_entry, *entry;
+
+    new_entry = lexbor_dobject_calloc(bst->dobject);
+    if (new_entry == NULL) {
+        return NULL;
+    }
+
+    new_entry->size = size;
+    new_entry->value = value;
+
+    bst->tree_length++;
+
+    if (*scope == NULL) {
+        *scope = new_entry;
+        return new_entry;
+    }
+
+    entry = *scope;
+
+    while (entry != NULL) {
+        if (size == entry->size) {
+            if (entry->next) {
+                new_entry->next = entry->next;
+            }
+
+            entry->next = new_entry;
+            new_entry->parent = entry->parent;
+
+            return new_entry;
+        }
+        else if (size > entry->size) {
+            if (entry->right == NULL) {
+                entry->right = new_entry;
+                new_entry->parent = entry;
+
+                return new_entry;
+            }
+
+            entry = entry->right;
+        }
+        else {
+            if (entry->left == NULL) {
+                entry->left = new_entry;
+                new_entry->parent = entry;
+
+                return new_entry;
+            }
+
+            entry = entry->left;
+        }
+    }
+
+    return NULL;
+}
+
+/*
+ * Return the node with key `size`, creating and linking an empty one when
+ * the key is not present yet.  Returns NULL when a node is needed but
+ * cannot be allocated.
+ */
+lexbor_bst_entry_t *
+lexbor_bst_insert_not_exists(lexbor_bst_t *bst, lexbor_bst_entry_t **scope,
+                             size_t size)
+{
+    lexbor_bst_entry_t *entry, *new_entry;
+
+    if (*scope == NULL) {
+        *scope = lexbor_bst_entry_make(bst, size);
+
+        return *scope;
+    }
+
+    entry = *scope;
+
+    while (entry != NULL) {
+        if (size == entry->size) {
+            return entry;
+        }
+        else if (size > entry->size) {
+            if (entry->right == NULL) {
+                /* Check the allocation before touching the new node. */
+                new_entry = lexbor_bst_entry_make(bst, size);
+                if (new_entry == NULL) {
+                    return NULL;
+                }
+
+                new_entry->parent = entry;
+                entry->right = new_entry;
+
+                return new_entry;
+            }
+
+            entry = entry->right;
+        }
+        else {
+            if (entry->left == NULL) {
+                new_entry = lexbor_bst_entry_make(bst, size);
+                if (new_entry == NULL) {
+                    return NULL;
+                }
+
+                new_entry->parent = entry;
+                entry->left = new_entry;
+
+                return new_entry;
+            }
+
+            entry = entry->left;
+        }
+    }
+
+    return NULL;
+}
+
+/*
+ * Find the node whose key equals `size`; NULL when there is none.
+ */
+lexbor_bst_entry_t *
+lexbor_bst_search(lexbor_bst_t *bst, lexbor_bst_entry_t *scope, size_t size)
+{
+    while (scope != NULL) {
+        if (scope->size == size) {
+            return scope;
+        }
+        else if (size > scope->size) {
+            scope = scope->right;
+        }
+        else {
+            scope = scope->left;
+        }
+    }
+
+    return NULL;
+}
+
+/*
+ * Find the node with key `size`, or failing that the last node on the
+ * search path whose key is greater than `size`.  NULL when every key on
+ * the path is smaller.
+ */
+lexbor_bst_entry_t *
+lexbor_bst_search_close(lexbor_bst_t *bst, lexbor_bst_entry_t *scope,
+                        size_t size)
+{
+    lexbor_bst_entry_t *max = NULL;
+
+    while (scope != NULL) {
+        if (scope->size == size) {
+            return scope;
+        }
+        else if (size > scope->size) {
+            scope = scope->right;
+        }
+        else {
+            max = scope;
+            scope = scope->left;
+        }
+    }
+
+    return max;
+}
+
+/*
+ * Remove one entry with key `size` and return its value; NULL when the
+ * key is not in the tree.
+ */
+void *
+lexbor_bst_remove(lexbor_bst_t *bst, lexbor_bst_entry_t **scope, size_t size)
+{
+    lexbor_bst_entry_t *entry = *scope;
+
+    while (entry != NULL) {
+        if (entry->size == size) {
+            return lexbor_bst_remove_by_pointer(bst, entry, scope);
+        }
+        else if (size > entry->size) {
+            entry = entry->right;
+        }
+        else {
+            entry = entry->left;
+        }
+    }
+
+    return NULL;
+}
+
+/*
+ * Remove the entry lexbor_bst_search_close() would find and return its
+ * value.  When `found_size` is not NULL it receives the key of the removed
+ * entry, or 0 when nothing was removed.
+ */
+void *
+lexbor_bst_remove_close(lexbor_bst_t *bst, lexbor_bst_entry_t **scope,
+                        size_t size, size_t *found_size)
+{
+    lexbor_bst_entry_t *entry = *scope;
+    lexbor_bst_entry_t *max = NULL;
+
+    while (entry != NULL) {
+        if (entry->size == size) {
+            if (found_size) {
+                *found_size = entry->size;
+            }
+
+            return lexbor_bst_remove_by_pointer(bst, entry, scope);
+        }
+        else if (size > entry->size) {
+            entry = entry->right;
+        }
+        else {
+            max = entry;
+            entry = entry->left;
+        }
+    }
+
+    if (max != NULL) {
+        if (found_size != NULL) {
+            *found_size = max->size;
+        }
+
+        return lexbor_bst_remove_by_pointer(bst, max, scope);
+    }
+
+    if (found_size != NULL) {
+        *found_size = 0;
+    }
+
+    return NULL;
+}
+
+/*
+ * Remove one value stored at `entry` and return it.
+ *
+ * When the node carries a `next` chain, the first chained node is released
+ * and its value returned; the tree shape does not change.  Otherwise the
+ * node itself is unlinked.  In the one-child case with a parent, the child
+ * is copied over `entry` and the child's node is released, so `entry`
+ * stays valid but pointers to that child do not.
+ */
+void *
+lexbor_bst_remove_by_pointer(lexbor_bst_t *bst, lexbor_bst_entry_t *entry,
+                             lexbor_bst_entry_t **root)
+{
+    void *value;
+    lexbor_bst_entry_t *next, *right, *left;
+
+    bst->tree_length--;
+
+    if (entry->next != NULL) {
+        next = entry->next;
+        entry->next = entry->next->next;
+
+        value = next->value;
+
+        lexbor_dobject_free(bst->dobject, next);
+
+        return value;
+    }
+
+    value = entry->value;
+
+    if (entry->left == NULL && entry->right == NULL) {
+        if (entry->parent != NULL) {
+            if (entry->parent->left == entry) entry->parent->left = NULL;
+            if (entry->parent->right == entry) entry->parent->right = NULL;
+        }
+        else {
+            *root = NULL;
+        }
+
+        lexbor_dobject_free(bst->dobject, entry);
+    }
+    else if (entry->left == NULL) {
+        if (entry->parent == NULL) {
+            entry->right->parent = NULL;
+
+            *root = entry->right;
+
+            lexbor_dobject_free(bst->dobject, entry);
+
+            entry = *root;
+        }
+        else {
+            right = entry->right;
+            right->parent = entry->parent;
+
+            memcpy(entry, right, sizeof(lexbor_bst_entry_t));
+
+            lexbor_dobject_free(bst->dobject, right);
+        }
+
+        /* The surviving node's children must point back at it. */
+        if (entry->right != NULL) {
+            entry->right->parent = entry;
+        }
+
+        if (entry->left != NULL) {
+            entry->left->parent = entry;
+        }
+    }
+    else if (entry->right == NULL) {
+        if (entry->parent == NULL) {
+            entry->left->parent = NULL;
+
+            *root = entry->left;
+
+            lexbor_dobject_free(bst->dobject, entry);
+
+            entry = *root;
+        }
+        else {
+            left = entry->left;
+            left->parent = entry->parent;
+
+            memcpy(entry, left, sizeof(lexbor_bst_entry_t));
+
+            lexbor_dobject_free(bst->dobject, left);
+        }
+
+        if (entry->right != NULL) {
+            entry->right->parent = entry;
+        }
+
+        if (entry->left != NULL) {
+            entry->left->parent = entry;
+        }
+    }
+    else {
+        /* Two children: `left` walks to the smallest key on the right. */
+        left = entry->right;
+
+        while (left->left != NULL) {
+            left = left->left;
+        }
+
+        /* Swap */
+        entry->size = left->size;
+        entry->next = left->next;
+        entry->value = left->value;
+
+        /* Change parent */
+        if (entry->right == left) {
+            entry->right = left->right;
+
+            if (entry->right != NULL) {
+                left->right->parent = entry;
+            }
+        }
+        else {
+            left->parent->left = left->right;
+
+            if (left->right != NULL) {
+                left->right->parent = left->parent;
+            }
+        }
+
+        lexbor_dobject_free(bst->dobject, left);
+    }
+
+    return value;
+}
+
+/*
+ * Dump the whole tree through `callback`, starting at the root.
+ */
+void
+lexbor_bst_serialize(lexbor_bst_t *bst, lexbor_callback_f callback, void *ctx)
+{
+    lexbor_bst_serialize_entry(bst->root, callback, ctx, 0);
+}
+
+/*
+ * Dump the children of `entry` as nested <left ...> / <right ...> elements,
+ * indented with `tabs` tab characters.  A missing child prints as NULL.
+ *
+ * NOTE(review): the four tag literals and the code between them were lost
+ * when the patch was extracted.  They are rebuilt from the surviving length
+ * arguments (8 and 9 for the closing tags) and the "NULL>" / ">\n" pieces;
+ * confirm against the original patch.
+ */
+void
+lexbor_bst_serialize_entry(lexbor_bst_entry_t *entry,
+                           lexbor_callback_f callback, void *ctx, size_t tabs)
+{
+    size_t len;
+    lxb_char_t buff[1024];
+
+    if (entry == NULL) {
+        return;
+    }
+
+    /* Left */
+    for (size_t i = 0; i < tabs; i++) {
+        callback((lxb_char_t *) "\t", 1, ctx);
+    }
+    callback((lxb_char_t *) "<left ", 6, ctx);
+
+    if (entry->left) {
+        len = lexbor_conv_int64_to_data((int64_t) entry->left->size,
+                                        buff, sizeof(buff));
+        callback(buff, len, ctx);
+
+        callback((lxb_char_t *) ">\n", 2, ctx);
+        lexbor_bst_serialize_entry(entry->left, callback, ctx, (tabs + 1));
+
+        for (size_t i = 0; i < tabs; i++) {
+            callback((lxb_char_t *) "\t", 1, ctx);
+        }
+    }
+    else {
+        callback((lxb_char_t *) "NULL>", 5, ctx);
+    }
+
+    callback((lxb_char_t *) "</left>\n", 8, ctx);
+
+    /* Right */
+    for (size_t i = 0; i < tabs; i++) {
+        callback((lxb_char_t *) "\t", 1, ctx);
+    }
+    callback((lxb_char_t *) "<right ", 7, ctx);
+
+    if (entry->right) {
+        len = lexbor_conv_int64_to_data((int64_t) entry->right->size,
+                                        buff, sizeof(buff));
+        callback(buff, len, ctx);
+
+        callback((lxb_char_t *) ">\n", 2, ctx);
+        lexbor_bst_serialize_entry(entry->right, callback, ctx, (tabs + 1));
+
+        for (size_t i = 0; i < tabs; i++) {
+            callback((lxb_char_t *) "\t", 1, ctx);
+        }
+    }
+    else {
+        callback((lxb_char_t *) "NULL>", 5, ctx);
+    }
+
+    callback((lxb_char_t *) "</right>\n", 9, ctx);
+}
diff --git a/contrib/url/lexbor/core/bst.h b/contrib/url/lexbor/core/bst.h
new file mode 100644
index 0000000000..3d7bf88c38
--- /dev/null
+++ b/contrib/url/lexbor/core/bst.h
@@ -0,0 +1,108 @@
+/*
+ * Copyright (C) 2018 Alexander Borisov
+ *
+ * Author: Alexander Borisov
+ */
+
+#ifndef LEXBOR_BST_H
+#define LEXBOR_BST_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/* NOTE(review): header name lost in extraction; <string.h> presumed. */
+#include <string.h>
+
+#include "lexbor/core/base.h"
+#include "lexbor/core/dobject.h"
+
+
+#define lexbor_bst_root(bst) ((bst)->root)
+#define lexbor_bst_root_ref(bst) (&((bst)->root))
+
+
+typedef struct lexbor_bst_entry lexbor_bst_entry_t;
+typedef struct lexbor_bst lexbor_bst_t;
+
+typedef bool (*lexbor_bst_entry_f)(lexbor_bst_t *bst,
+                                   lexbor_bst_entry_t *entry, void *ctx);
+
+struct lexbor_bst_entry {
+    void *value;
+
+    lexbor_bst_entry_t *right;
+    lexbor_bst_entry_t *left;
+    lexbor_bst_entry_t *next;
+    lexbor_bst_entry_t *parent;
+
+    size_t size;
+};
+
+struct lexbor_bst {
+    lexbor_dobject_t *dobject;
+    lexbor_bst_entry_t *root;
+
+    size_t tree_length;
+};
+
+
+LXB_API lexbor_bst_t *
+lexbor_bst_create(void);
+
+LXB_API lxb_status_t
+lexbor_bst_init(lexbor_bst_t *bst, size_t size);
+
+LXB_API void
+lexbor_bst_clean(lexbor_bst_t *bst);
+
+LXB_API lexbor_bst_t *
+lexbor_bst_destroy(lexbor_bst_t *bst, bool self_destroy);
+
+LXB_API lexbor_bst_entry_t *
+lexbor_bst_entry_make(lexbor_bst_t *bst, size_t size);
+
+LXB_API lexbor_bst_entry_t *
+lexbor_bst_insert(lexbor_bst_t *bst, lexbor_bst_entry_t **scope,
+                  size_t size, void *value);
+
+LXB_API lexbor_bst_entry_t *
+lexbor_bst_insert_not_exists(lexbor_bst_t *bst, lexbor_bst_entry_t **scope,
+                             size_t size);
+
+
+LXB_API lexbor_bst_entry_t *
+lexbor_bst_search(lexbor_bst_t *bst, lexbor_bst_entry_t *scope, size_t size);
+
+LXB_API lexbor_bst_entry_t *
+lexbor_bst_search_close(lexbor_bst_t *bst, lexbor_bst_entry_t *scope,
+                        size_t size);
+
+
+LXB_API void *
+lexbor_bst_remove(lexbor_bst_t *bst, lexbor_bst_entry_t **root,
+                  size_t size);
+
+LXB_API void *
+lexbor_bst_remove_close(lexbor_bst_t *bst, lexbor_bst_entry_t **root,
+                        size_t size, size_t *found_size);
+
+LXB_API void *
+lexbor_bst_remove_by_pointer(lexbor_bst_t *bst, lexbor_bst_entry_t *entry,
+                             lexbor_bst_entry_t **root);
+
+
+LXB_API void
+lexbor_bst_serialize(lexbor_bst_t *bst, lexbor_callback_f callback, void *ctx);
+
+LXB_API void
+lexbor_bst_serialize_entry(lexbor_bst_entry_t *entry,
+                           lexbor_callback_f callback, void *ctx, size_t tabs);
+
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+#endif /* LEXBOR_BST_H */
+
+
+
diff --git a/contrib/url/lexbor/core/conv.c b/contrib/url/lexbor/core/conv.c
new file mode 100644
index 0000000000..5cc17c872f
--- /dev/null
+++ b/contrib/url/lexbor/core/conv.c
@@ -0,0 +1,346 @@
+/*
+ * Copyright (C) 2018 Alexander Borisov
+ *
+ * Author: Alexander Borisov
+ */
+
+/* NOTE(review): both header names were lost in extraction; presumed. */
+#include <math.h>
+#include <float.h>
+
+#include "lexbor/core/conv.h"
+#include "lexbor/core/dtoa.h"
+#include "lexbor/core/strtod.h"
+
+
+/*
+ * Format `num` into `buf`; thin wrapper around lexbor_dtoa().
+ */
+size_t
+lexbor_conv_float_to_data(double num, lxb_char_t *buf, size_t len)
+{
+    return lexbor_dtoa(num, buf, len);
+}
+
+/*
+ * Format a long; see lexbor_conv_int64_to_data().
+ */
+size_t
+lexbor_conv_long_to_data(long num, lxb_char_t *buf, size_t len)
+{
+    return lexbor_conv_int64_to_data((int64_t) num, buf, len);
+}
+
+/*
+ * Write the decimal text of `num` into `buf`, which holds `len` bytes.
+ *
+ * Returns the number of characters written.  When the text does not fit,
+ * the low-order digits are dropped so that exactly `len` characters are
+ * produced.  A terminating NUL is stored only when a byte is left for it
+ * (never for the value 0, matching the previous behaviour), so callers
+ * must use the returned length.
+ */
+size_t
+lexbor_conv_int64_to_data(int64_t num, lxb_char_t *buf, size_t len)
+{
+    uint64_t value, tmp;
+    size_t have_minus, i, length;
+
+    static const lxb_char_t *digits = (const lxb_char_t *) "0123456789";
+
+    if (len == 0) {
+        return 0;
+    }
+
+    if (num == 0) {
+        buf[0] = '0';
+        return 1;
+    }
+
+    have_minus = (num < 0) ? 1 : 0;
+
+    /* Negate in unsigned arithmetic: -INT64_MIN is undefined for int64_t. */
+    value = have_minus ? (uint64_t) 0 - (uint64_t) num : (uint64_t) num;
+
+    length = have_minus;
+
+    for (tmp = value; tmp != 0; tmp /= 10) {
+        length += 1;
+    }
+
+    if (len < length) {
+        /* Drop as many low-order digits as do not fit. */
+        for (i = length - len; i != 0; i--) {
+            value /= 10;
+        }
+
+        length = len;
+    }
+
+    if (have_minus) {
+        buf[0] = '-';
+    }
+
+    /* Terminate only inside the buffer. */
+    if (length < len) {
+        buf[length] = '\0';
+    }
+
+    i = length;
+
+    while (i != have_minus) {
+        i -= 1;
+        buf[i] = digits[ value % 10 ];
+        value /= 10;
+    }
+
+    return length;
+}
+
+/*
+ * Parse a decimal floating point number ([+-]digits[.digits][e[+-]digits])
+ * from the `len` bytes at *start.  At most 128 significant digits are kept;
+ * further integer digits only adjust the exponent.  On return *start points
+ * just past the parsed text.  The digits and exponent are converted by
+ * lexbor_strtod_internal().
+ */
+double
+lexbor_conv_data_to_double(const lxb_char_t **start, size_t len)
+{
+    int exponent, exp, insignf;
+    lxb_char_t c, *pos;
+    bool minus, ex_minus;
+    double num;
+    const lxb_char_t *e, *p, *last, *end;
+    lxb_char_t data[128];
+
+    end = *start + len;
+
+    exponent = 0;
+    insignf = 0;
+
+    pos = data;
+    last = data + sizeof(data);
+
+    minus = false;
+
+    /* Do not look at the first byte of an empty input. */
+    if (*start < end) {
+        switch (**start) {
+            case '-':
+                minus = true;
+                /* fall through */
+            case '+':
+                (*start)++;
+                /* fall through */
+            default:
+                break;
+        }
+    }
+
+    for (p = *start; p < end; p++) {
+        /* Values less than '0' become >= 208. */
+        c = *p - '0';
+
+        if (c > 9) {
+            break;
+        }
+
+        if (pos < last) {
+            *pos++ = *p;
+        }
+        else {
+            insignf++;
+        }
+    }
+
+    /* Do not emit a '.', but adjust the exponent instead. */
+    if (p < end && *p == '.') {
+
+        for (p++; p < end; p++) {
+            /* Values less than '0' become >= 208. */
+            c = *p - '0';
+
+            if (c > 9) {
+                break;
+            }
+
+            if (pos < last) {
+                *pos++ = *p;
+                exponent--;
+            }
+            else {
+                /* Ignore insignificant digits in the fractional part. */
+            }
+        }
+    }
+
+    /* An exponent needs 'e' plus at least one more byte. */
+    if (end - p > 1 && (*p == 'e' || *p == 'E')) {
+        e = p + 1;
+        ex_minus = false;
+
+        if (e + 1 < end) {
+            if (*e == '-') {
+                e++;
+                ex_minus = true;
+            }
+            else if (*e == '+') {
+                e++;
+            }
+        }
+
+        /* Values less than '0' become >= 208. */
+        c = *e - '0';
+
+        if (c <= 9) {
+            exp = c;
+
+            for (p = e + 1; p < end; p++) {
+                /* Values less than '0' become >= 208. */
+                c = *p - '0';
+
+                if (c > 9) {
+                    break;
+                }
+
+                /*
+                 * Saturate instead of overflowing int: an exponent this
+                 * large is far outside the range of a double anyway.
+                 */
+                if (exp < 100000) {
+                    exp = exp * 10 + c;
+                }
+            }
+
+            exponent += ex_minus ? -exp : exp;
+        }
+    }
+
+    *start = p;
+
+    exponent += insignf;
+
+    num = lexbor_strtod_internal(data, pos - data, exponent);
+
+    if (minus) {
+        num = -num;
+    }
+
+    return num;
+}
+
+/*
+ * Parse leading decimal digits from the `length` bytes at *data.
+ *
+ * Stops at the first non-digit, leaving *data on it.  When the next digit
+ * would overflow, returns the value accumulated so far and leaves *data on
+ * the last digit that was consumed.
+ */
+unsigned long
+lexbor_conv_data_to_ulong(const lxb_char_t **data, size_t length)
+{
+    const lxb_char_t *p = *data;
+    const lxb_char_t *end = p + length;
+    unsigned long digit, number = 0;
+
+    for (; p < end; p++) {
+        if (*p < '0' || *p > '9') {
+            break;
+        }
+
+        digit = (unsigned long) (*p - '0');
+
+        /* Test before multiplying; a wrapped result cannot be detected. */
+        if (number > (ULONG_MAX - digit) / 10) {
+            *data = p - 1;
+            return number;
+        }
+
+        number = digit + number * 10;
+    }
+
+    *data = p;
+
+    return number;
+}
+
+/*
+ * Parse an optionally signed decimal integer from the `length` bytes at
+ * *data.  The magnitude is capped at LONG_MAX: parsing stops before the
+ * digit that would exceed it and *data is left on the last digit consumed.
+ * Otherwise *data ends up on the first unparsed byte.
+ */
+long
+lexbor_conv_data_to_long(const lxb_char_t **data, size_t length)
+{
+    bool minus;
+    const lxb_char_t *p;
+    const lxb_char_t *end;
+    unsigned long digit, number = 0;
+
+    minus = false;
+    p = *data;
+    end = p + length;
+
+    /* Do not look at the first byte of an empty input. */
+    if (p < end) {
+        switch (*p) {
+            case '-':
+                minus = true;
+                /* fall through */
+            case '+':
+                p++;
+                /* fall through */
+            default:
+                break;
+        }
+    }
+
+    for (; p < end; p++) {
+        if (*p < '0' || *p > '9') {
+            break;
+        }
+
+        digit = (unsigned long) (*p - '0');
+
+        if (number > ((unsigned long) LONG_MAX - digit) / 10) {
+            p -= 1;
+            break;
+        }
+
+        number = digit + number * 10;
+    }
+
+    *data = p;
+
+    /* number <= LONG_MAX here, so both conversions are exact. */
+    return (minus) ? -(long) number : (long) number;
+}
+
+/*
+ * Same contract as lexbor_conv_data_to_ulong(), for unsigned int.
+ */
+unsigned
+lexbor_conv_data_to_uint(const lxb_char_t **data, size_t length)
+{
+    const lxb_char_t *p = *data;
+    const lxb_char_t *end = p + length;
+    unsigned digit, number = 0;
+
+    for (; p < end; p++) {
+        if (*p < '0' || *p > '9') {
+            break;
+        }
+
+        digit = (unsigned) (*p - '0');
+
+        if (number > (UINT_MAX - digit) / 10) {
+            *data = p - 1;
+            return number;
+        }
+
+        number = digit + number * 10;
+    }
+
+    *data = p;
+
+    return number;
+}
+
+/*
+ * Write `number` as lower-case hexadecimal into `out`, which holds `length`
+ * bytes.  No NUL is stored.  Returns the number of characters written, or
+ * 0 when the text does not fit (nothing is written then).
+ */
+size_t
+lexbor_conv_dec_to_hex(uint32_t number, lxb_char_t *out, size_t length)
+{
+    size_t len, pos;
+    uint32_t tmp;
+
+    static const lxb_char_t map_str[] = "0123456789abcdef";
+
+    if (number == 0) {
+        if (length > 0) {
+            out[0] = '0';
+            return 1;
+        }
+
+        return 0;
+    }
+
+    len = 0;
+
+    for (tmp = number; tmp != 0; tmp /= 16) {
+        len += 1;
+    }
+
+    /* Respect the capacity the caller passed in. */
+    if (len > length) {
+        return 0;
+    }
+
+    pos = len;
+
+    while (number != 0) {
+        out[ --pos ] = map_str[ number % 16 ];
+        number /= 16;
+    }
+
+    return len;
+}
diff --git a/contrib/url/lexbor/core/conv.h b/contrib/url/lexbor/core/conv.h
new file mode 100644
index 0000000000..f902cd5964
--- /dev/null
+++ b/contrib/url/lexbor/core/conv.h
@@ -0,0 +1,61 @@
+/*
+ * Copyright (C) 2018 Alexander Borisov
+ *
+ * Author: Alexander Borisov
+ */
+
+#ifndef LEXBOR_CONV_H
+#define LEXBOR_CONV_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+
+#include "lexbor/core/base.h"
+
+
+LXB_API size_t
+lexbor_conv_float_to_data(double num, lxb_char_t *buf, size_t len);
+
+LXB_API size_t
+lexbor_conv_long_to_data(long num, lxb_char_t *buf, size_t len);
+
+LXB_API size_t
+lexbor_conv_int64_to_data(int64_t num, lxb_char_t *buf, size_t len);
+
+LXB_API double
+lexbor_conv_data_to_double(const lxb_char_t **start, size_t len);
+
+LXB_API unsigned long
+lexbor_conv_data_to_ulong(const lxb_char_t **data, size_t length);
+
+LXB_API long
+lexbor_conv_data_to_long(const lxb_char_t **data, size_t length);
+
+LXB_API unsigned
+lexbor_conv_data_to_uint(const lxb_char_t **data, size_t length);
+
+LXB_API size_t
+lexbor_conv_dec_to_hex(uint32_t number, lxb_char_t *out, size_t length);
+
+/*
+ * Convert a double to long, clamping instead of overflowing.
+ *
+ * Values at or above LONG_MAX give LONG_MAX, values below LONG_MIN give
+ * -LONG_MAX (kept from the original), NaN gives 0.
+ */
+lxb_inline long
+lexbor_conv_double_to_long(double number)
+{
+    /* NaN fails every ordered comparison and must not reach the cast. */
+    if (number != number) {
+        return 0;
+    }
+
+    /*
+     * ">=" rather than ">": where long is 64 bits, (double) LONG_MAX rounds
+     * up to 2^63, and casting that value back to long is out of range.
+     */
+    if (number >= (double) LONG_MAX) {
+        return LONG_MAX;
+    }
+
+    if (number < (double) LONG_MIN) {
+        return -LONG_MAX;
+    }
+
+    return (long) number;
+}
+
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+#endif /* LEXBOR_CONV_H */
diff --git a/contrib/url/lexbor/core/def.h b/contrib/url/lexbor/core/def.h
new file mode 100644
index 0000000000..5a48f59e99
--- /dev/null
+++ b/contrib/url/lexbor/core/def.h
@@ -0,0 +1,57 @@
+/*
+ * Copyright (C) 2018 Alexander Borisov
+ *
+ * Author: Alexander Borisov
+ */
+
+#ifndef LEXBOR_DEF_H
+#define LEXBOR_DEF_H
+
+#define LEXBOR_STRINGIZE_HELPER(x) #x
+#define LEXBOR_STRINGIZE(x) LEXBOR_STRINGIZE_HELPER(x)
+
+/* Format */
+#ifdef _WIN32
+    #define LEXBOR_FORMAT_Z "%Iu"
+#else
+    #define LEXBOR_FORMAT_Z "%zu"
+#endif
+
+/* Deprecated */
+#ifdef _MSC_VER
+    #define LXB_DEPRECATED(func) __declspec(deprecated) func
+#elif defined(__GNUC__) || defined(__INTEL_COMPILER)
+    #define LXB_DEPRECATED(func) func __attribute__((deprecated))
+#else
+    #define LXB_DEPRECATED(func) func
+#endif
+
+/* Debug */
+//#define LEXBOR_DEBUG(...) do {} while (0)
+//#define LEXBOR_DEBUG_ERROR(...) do {} while (0)
+
+#define LEXBOR_MEM_ALIGN_STEP sizeof(void *)
+
+/* LXB_API: symbol export/import decoration; empty for static builds. */
+#ifndef LEXBOR_STATIC
+    #ifdef _WIN32
+        #ifdef LEXBOR_BUILDING
+            #define LXB_API __declspec(dllexport)
+        #else
+            #define LXB_API __declspec(dllimport)
+        #endif
+    #elif (defined(__SUNPRO_C) || defined(__SUNPRO_CC))
+        #define LXB_API __global
+    #else
+        #if (defined(__GNUC__) && __GNUC__ >= 4) || defined(__INTEL_COMPILER)
+            #define LXB_API __attribute__ ((visibility("default")))
+        #else
+            #define LXB_API
+        #endif
+    #endif
+#else
+    #define LXB_API
+#endif
+
+#define LXB_EXTERN extern LXB_API
+
+#endif /* LEXBOR_DEF_H */
diff --git a/contrib/url/lexbor/core/diyfp.c b/contrib/url/lexbor/core/diyfp.c
new file mode 100644
index 0000000000..62020bbc64
--- /dev/null
+++ b/contrib/url/lexbor/core/diyfp.c
@@ -0,0 +1,153 @@
+/*
+ * Copyright (C) Alexander Borisov
+ *
+ * Based on nxt_diyfp.c from NGINX NJS project
+ *
+ * Copyright (C) Dmitry Volyntsev
+ * Copyright (C) NGINX, Inc.
+ *
+ * An internal diy_fp implementation.
+ * For details, see Loitsch, Florian. "Printing floating-point numbers quickly
+ * and accurately with integers." ACM Sigplan Notices 45.6 (2010): 233-243.
+ */
+
+#include "lexbor/core/diyfp.h"
+
+
+/* One cached power of ten: significand * 2^bin_exp approximates 10^dec_exp. */
+typedef struct {
+    uint64_t significand;
+    int16_t bin_exp;
+    int16_t dec_exp;
+}
+lexbor_diyfp_cpe_t;
+
+
+/*
+ * Powers of ten from 10^-348 to 10^340 in steps of 8 decimal exponents
+ * (LEXBOR_DECIMAL_EXPONENT_MIN .. _MAX, step LEXBOR_DECIMAL_EXPONENT_DIST).
+ */
+static const lexbor_diyfp_cpe_t lexbor_cached_powers[] = {
+    { lexbor_uint64_hl(0xfa8fd5a0, 0x081c0288), -1220, -348 },
+    { lexbor_uint64_hl(0xbaaee17f, 0xa23ebf76), -1193, -340 },
+    { lexbor_uint64_hl(0x8b16fb20, 0x3055ac76), -1166, -332 },
+    { lexbor_uint64_hl(0xcf42894a, 0x5dce35ea), -1140, -324 },
+    { lexbor_uint64_hl(0x9a6bb0aa, 0x55653b2d), -1113, -316 },
+    { lexbor_uint64_hl(0xe61acf03, 0x3d1a45df), -1087, -308 },
+    { lexbor_uint64_hl(0xab70fe17, 0xc79ac6ca), -1060, -300 },
+    { lexbor_uint64_hl(0xff77b1fc, 0xbebcdc4f), -1034, -292 },
+    { lexbor_uint64_hl(0xbe5691ef, 0x416bd60c), -1007, -284 },
+    { lexbor_uint64_hl(0x8dd01fad, 0x907ffc3c), -980, -276 },
+    { lexbor_uint64_hl(0xd3515c28, 0x31559a83), -954, -268 },
+    { lexbor_uint64_hl(0x9d71ac8f, 0xada6c9b5), -927, -260 },
+    { lexbor_uint64_hl(0xea9c2277, 0x23ee8bcb), -901, -252 },
+    { lexbor_uint64_hl(0xaecc4991, 0x4078536d), -874, -244 },
+    { lexbor_uint64_hl(0x823c1279, 0x5db6ce57), -847, -236 },
+    { lexbor_uint64_hl(0xc2109436, 0x4dfb5637), -821, -228 },
+    { lexbor_uint64_hl(0x9096ea6f, 0x3848984f), -794, -220 },
+    { lexbor_uint64_hl(0xd77485cb, 0x25823ac7), -768, -212 },
+    { lexbor_uint64_hl(0xa086cfcd, 0x97bf97f4), -741, -204 },
+    { lexbor_uint64_hl(0xef340a98, 0x172aace5), -715, -196 },
+    { lexbor_uint64_hl(0xb23867fb, 0x2a35b28e), -688, -188 },
+    { lexbor_uint64_hl(0x84c8d4df, 0xd2c63f3b), -661, -180 },
+    { lexbor_uint64_hl(0xc5dd4427, 0x1ad3cdba), -635, -172 },
+    { lexbor_uint64_hl(0x936b9fce, 0xbb25c996), -608, -164 },
+    { lexbor_uint64_hl(0xdbac6c24, 0x7d62a584), -582, -156 },
+    { lexbor_uint64_hl(0xa3ab6658, 0x0d5fdaf6), -555, -148 },
+    { lexbor_uint64_hl(0xf3e2f893, 0xdec3f126), -529, -140 },
+    { lexbor_uint64_hl(0xb5b5ada8, 0xaaff80b8), -502, -132 },
+    { lexbor_uint64_hl(0x87625f05, 0x6c7c4a8b), -475, -124 },
+    { lexbor_uint64_hl(0xc9bcff60, 0x34c13053), -449, -116 },
+    { lexbor_uint64_hl(0x964e858c, 0x91ba2655), -422, -108 },
+    { lexbor_uint64_hl(0xdff97724, 0x70297ebd), -396, -100 },
+    { lexbor_uint64_hl(0xa6dfbd9f, 0xb8e5b88f), -369, -92 },
+    { lexbor_uint64_hl(0xf8a95fcf, 0x88747d94), -343, -84 },
+    { lexbor_uint64_hl(0xb9447093, 0x8fa89bcf), -316, -76 },
+    { lexbor_uint64_hl(0x8a08f0f8, 0xbf0f156b), -289, -68 },
+    { lexbor_uint64_hl(0xcdb02555, 0x653131b6), -263, -60 },
+    { lexbor_uint64_hl(0x993fe2c6, 0xd07b7fac), -236, -52 },
+    { lexbor_uint64_hl(0xe45c10c4, 0x2a2b3b06), -210, -44 },
+    { lexbor_uint64_hl(0xaa242499, 0x697392d3), -183, -36 },
+    { lexbor_uint64_hl(0xfd87b5f2, 0x8300ca0e), -157, -28 },
+    { lexbor_uint64_hl(0xbce50864, 0x92111aeb), -130, -20 },
+    { lexbor_uint64_hl(0x8cbccc09, 0x6f5088cc), -103, -12 },
+    { lexbor_uint64_hl(0xd1b71758, 0xe219652c), -77, -4 },
+    { lexbor_uint64_hl(0x9c400000, 0x00000000), -50, 4 },
+    { lexbor_uint64_hl(0xe8d4a510, 0x00000000), -24, 12 },
+    { lexbor_uint64_hl(0xad78ebc5, 0xac620000), 3, 20 },
+    { lexbor_uint64_hl(0x813f3978, 0xf8940984), 30, 28 },
+    { lexbor_uint64_hl(0xc097ce7b, 0xc90715b3), 56, 36 },
+    { lexbor_uint64_hl(0x8f7e32ce, 0x7bea5c70), 83, 44 },
+    { lexbor_uint64_hl(0xd5d238a4, 0xabe98068), 109, 52 },
+    { lexbor_uint64_hl(0x9f4f2726, 0x179a2245), 136, 60 },
+    { lexbor_uint64_hl(0xed63a231, 0xd4c4fb27), 162, 68 },
+    { lexbor_uint64_hl(0xb0de6538, 0x8cc8ada8), 189, 76 },
+    { lexbor_uint64_hl(0x83c7088e, 0x1aab65db), 216, 84 },
+    { lexbor_uint64_hl(0xc45d1df9, 0x42711d9a), 242, 92 },
+    { lexbor_uint64_hl(0x924d692c, 0xa61be758), 269, 100 },
+    { lexbor_uint64_hl(0xda01ee64, 0x1a708dea), 295, 108 },
+    { lexbor_uint64_hl(0xa26da399, 0x9aef774a), 322, 116 },
+    { lexbor_uint64_hl(0xf209787b, 0xb47d6b85), 348, 124 },
+    { lexbor_uint64_hl(0xb454e4a1, 0x79dd1877), 375, 132 },
+    { lexbor_uint64_hl(0x865b8692, 0x5b9bc5c2), 402, 140 },
+    { lexbor_uint64_hl(0xc83553c5, 0xc8965d3d), 428, 148 },
+    { lexbor_uint64_hl(0x952ab45c, 0xfa97a0b3), 455, 156 },
+    { lexbor_uint64_hl(0xde469fbd, 0x99a05fe3), 481, 164 },
+    { lexbor_uint64_hl(0xa59bc234, 0xdb398c25), 508, 172 },
+    { lexbor_uint64_hl(0xf6c69a72, 0xa3989f5c), 534, 180 },
+    { lexbor_uint64_hl(0xb7dcbf53, 0x54e9bece), 561, 188 },
+    { lexbor_uint64_hl(0x88fcf317, 0xf22241e2), 588, 196 },
+    { lexbor_uint64_hl(0xcc20ce9b, 0xd35c78a5), 614, 204 },
+    { lexbor_uint64_hl(0x98165af3, 0x7b2153df), 641, 212 },
+    { lexbor_uint64_hl(0xe2a0b5dc, 0x971f303a), 667, 220 },
+    { lexbor_uint64_hl(0xa8d9d153, 0x5ce3b396), 694, 228 },
+    { lexbor_uint64_hl(0xfb9b7cd9, 0xa4a7443c), 720, 236 },
+    { lexbor_uint64_hl(0xbb764c4c, 0xa7a44410), 747, 244 },
+    { lexbor_uint64_hl(0x8bab8eef, 0xb6409c1a), 774, 252 },
+    { lexbor_uint64_hl(0xd01fef10, 0xa657842c), 800, 260 },
+    { lexbor_uint64_hl(0x9b10a4e5, 0xe9913129), 827, 268 },
+    { lexbor_uint64_hl(0xe7109bfb, 0xa19c0c9d), 853, 276 },
+    { lexbor_uint64_hl(0xac2820d9, 0x623bf429), 880, 284 },
+    { lexbor_uint64_hl(0x80444b5e, 0x7aa7cf85), 907, 292 },
+    { lexbor_uint64_hl(0xbf21e440, 0x03acdd2d), 933, 300 },
+    { lexbor_uint64_hl(0x8e679c2f, 0x5e44ff8f), 960, 308 },
+    { lexbor_uint64_hl(0xd433179d, 0x9c8cb841), 986, 316 },
+    { lexbor_uint64_hl(0x9e19db92, 0xb4e31ba9), 1013, 324 },
+    { lexbor_uint64_hl(0xeb96bf6e, 0xbadf77d9), 1039, 332 },
+    { lexbor_uint64_hl(0xaf87023b, 0x9bf0ee6b), 1066, 340 },
+};
+
+
+#define LEXBOR_DIYFP_D_1_LOG2_10 0.30102999566398114 /* 1 / log2(10). */
+
+/* Index of the last entry of lexbor_cached_powers. */
+#define LEXBOR_DIYFP_CPE_LAST                                                 \
+    ((int) (sizeof(lexbor_cached_powers) / sizeof(lexbor_cached_powers[0])) - 1)
+
+
+/*
+ * Return the cached power of ten for decimal exponent `exp`
+ * (table index (exp + LEXBOR_DECIMAL_EXPONENT_OFF) / 8) and store the
+ * entry's exact decimal exponent in *dec_exp.  An exponent outside the
+ * table is clamped to the nearest entry instead of reading out of bounds.
+ */
+lexbor_diyfp_t
+lexbor_cached_power_dec(int exp, int *dec_exp)
+{
+    int index;
+    const lexbor_diyfp_cpe_t *cp;
+
+    index = (exp + LEXBOR_DECIMAL_EXPONENT_OFF) / LEXBOR_DECIMAL_EXPONENT_DIST;
+
+    if (index < 0) {
+        index = 0;
+    }
+    else if (index > LEXBOR_DIYFP_CPE_LAST) {
+        index = LEXBOR_DIYFP_CPE_LAST;
+    }
+
+    cp = &lexbor_cached_powers[index];
+
+    *dec_exp = cp->dec_exp;
+
+    return lexbor_diyfp(cp->significand, cp->bin_exp);
+}
+
+/*
+ * Return a cached power of ten chosen from the binary exponent `exp` and
+ * store the negated decimal exponent of that entry in *dec_exp.  An index
+ * outside the table is clamped to the nearest entry.
+ */
+lexbor_diyfp_t
+lexbor_cached_power_bin(int exp, int *dec_exp)
+{
+    int k, index;
+    const lexbor_diyfp_cpe_t *cp;
+
+    k = (int) ceil((-61 - exp) * LEXBOR_DIYFP_D_1_LOG2_10)
+        + LEXBOR_DECIMAL_EXPONENT_OFF - 1;
+
+    if (k < 0) {
+        index = 0;
+    }
+    else {
+        index = (k >> 3) + 1;
+
+        if (index > LEXBOR_DIYFP_CPE_LAST) {
+            index = LEXBOR_DIYFP_CPE_LAST;
+        }
+    }
+
+    cp = &lexbor_cached_powers[index];
+
+    *dec_exp = -(LEXBOR_DECIMAL_EXPONENT_MIN + (index << 3));
+
+    return lexbor_diyfp(cp->significand, cp->bin_exp);
+}
+
+
+#undef LEXBOR_DIYFP_CPE_LAST
+#undef LEXBOR_DIYFP_D_1_LOG2_10
diff --git a/contrib/url/lexbor/core/diyfp.h b/contrib/url/lexbor/core/diyfp.h
new file mode 100644
index 0000000000..47fedb9da9
--- /dev/null
+++ b/contrib/url/lexbor/core/diyfp.h
@@ -0,0 +1,258 @@
+/*
+ * Copyright (C) Alexander Borisov
+ *
+ * Based on nxt_diyfp.h from NGINX NJS project
+ *
+ * Copyright (C) Dmitry Volyntsev
+ * Copyright (C) NGINX, Inc.
+ *
+ * An internal diy_fp implementation.
+ * For details, see Loitsch, Florian. "Printing floating-point numbers quickly
+ * and accurately with integers." ACM Sigplan Notices 45.6 (2010): 233-243.
+ */
+
+#ifndef LEXBOR_DIYFP_H
+#define LEXBOR_DIYFP_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "lexbor/core/base.h"
+
+/* INFINITY, used by lexbor_diyfp_2d(). */
+#include <math.h>
+
+
+/*
+ * Builds a lexbor_diyfp_t from a significand and an exponent: a braced
+ * initializer in C++, a compound literal in C.  The exponent is cast to int.
+ */
+#ifdef __cplusplus
+#define lexbor_diyfp(_s, _e)           { .significand = (_s), .exp = (int) (_e) }
+#else
+#define lexbor_diyfp(_s, _e)           (lexbor_diyfp_t)                         \
+                                       { .significand = (_s), .exp = (int) (_e) }
+#endif
+/* Combines a high and a low 32-bit half into one 64-bit value. */
+#define lexbor_uint64_hl(h, l)   (((uint64_t) (h) << 32) + (l))
+
+
+/*
+ * Layout of a double as used by the conversions below: 52 stored significand
+ * bits, an 11-bit biased exponent, and an implicit (hidden) bit above the
+ * stored significand.  The bias already includes the significand size, so
+ * exponents here scale an integer significand.
+ */
+#define LEXBOR_DBL_SIGNIFICAND_SIZE    52
+#define LEXBOR_DBL_EXPONENT_BIAS       (0x3FF + LEXBOR_DBL_SIGNIFICAND_SIZE)
+#define LEXBOR_DBL_EXPONENT_MIN        (-LEXBOR_DBL_EXPONENT_BIAS)
+#define LEXBOR_DBL_EXPONENT_MAX        (0x7FF - LEXBOR_DBL_EXPONENT_BIAS)
+#define LEXBOR_DBL_EXPONENT_DENORMAL   (-LEXBOR_DBL_EXPONENT_BIAS + 1)
+
+#define LEXBOR_DBL_SIGNIFICAND_MASK    lexbor_uint64_hl(0x000FFFFF, 0xFFFFFFFF)
+#define LEXBOR_DBL_HIDDEN_BIT          lexbor_uint64_hl(0x00100000, 0x00000000)
+#define LEXBOR_DBL_EXPONENT_MASK       lexbor_uint64_hl(0x7FF00000, 0x00000000)
+
+#define LEXBOR_DIYFP_SIGNIFICAND_SIZE  64
+
+#define LEXBOR_SIGNIFICAND_SIZE        53
+#define LEXBOR_SIGNIFICAND_SHIFT       (LEXBOR_DIYFP_SIGNIFICAND_SIZE           \
+                                        - LEXBOR_DBL_SIGNIFICAND_SIZE)
+
+/* Parameters of the cached powers-of-ten table (see lexbor_cached_power_*). */
+#define LEXBOR_DECIMAL_EXPONENT_OFF    348
+#define LEXBOR_DECIMAL_EXPONENT_MIN    (-348)
+#define LEXBOR_DECIMAL_EXPONENT_MAX    340
+#define LEXBOR_DECIMAL_EXPONENT_DIST   8
+
+
+/* A 64-bit significand paired with an int exponent. */
+typedef struct {
+    uint64_t significand;
+    int      exp;
+}
+lexbor_diyfp_t;
+
+
+LXB_API lexbor_diyfp_t
+lexbor_cached_power_dec(int exp, int *dec_exp);
+
+LXB_API lexbor_diyfp_t
+lexbor_cached_power_bin(int exp, int *dec_exp);
+
+
+/*
+ * Inline functions
+ */
+
+/*
+ * Number of leading zero bits in a 64-bit value; 64 for zero.
+ *
+ * The builtin variant must carry the same name as the portable function
+ * below because lexbor_diyfp_normalize() calls lexbor_diyfp_leading_zeros64()
+ * in both configurations.  The macro evaluates its argument twice.
+ */
+#ifdef LEXBOR_HAVE_BUILTIN_CLZLL
+#define lexbor_diyfp_leading_zeros64(x)  (((x) == 0) ? 64 : __builtin_clzll(x))
+
+/* Former name of the macro above, kept for backward compatibility. */
+#define nxt_leading_zeros64(x)  lexbor_diyfp_leading_zeros64(x)
+
+#else
+
+lxb_inline uint64_t
+lexbor_diyfp_leading_zeros64(uint64_t x)
+{
+    uint64_t n;
+
+    if (x == 0) {
+        return 64;
+    }
+
+    n = 0;
+
+    /* Shift left until the top bit is set, counting the steps. */
+    while ((x & 0x8000000000000000) == 0) {
+        n++;
+        x <<= 1;
+    }
+
+    return n;
+}
+
+#endif
+
+/*
+ * Splits a double into significand and exponent.
+ *
+ * The sign bit is ignored.  A non-zero biased exponent adds the hidden bit
+ * and removes the bias; a zero biased exponent (zero or denormal) keeps the
+ * stored significand and uses LEXBOR_DBL_EXPONENT_MIN + 1.
+ */
+lxb_inline lexbor_diyfp_t
+lexbor_diyfp_from_d2(double d)
+{
+    int biased_exp;
+    uint64_t significand;
+    lexbor_diyfp_t r;
+
+    union {
+        double   d;
+        uint64_t u64;
+    }
+    u;
+
+    u.d = d;
+
+    biased_exp = (u.u64 & LEXBOR_DBL_EXPONENT_MASK)
+                 >> LEXBOR_DBL_SIGNIFICAND_SIZE;
+    significand = u.u64 & LEXBOR_DBL_SIGNIFICAND_MASK;
+
+    if (biased_exp != 0) {
+        r.significand = significand + LEXBOR_DBL_HIDDEN_BIT;
+        r.exp = biased_exp - LEXBOR_DBL_EXPONENT_BIAS;
+    }
+    else {
+        r.significand = significand;
+        r.exp = LEXBOR_DBL_EXPONENT_MIN + 1;
+    }
+
+    return r;
+}
+
+/*
+ * Converts a significand/exponent pair back to a non-negative double.
+ *
+ * Low bits are dropped (no rounding) while the significand is wider than
+ * 53 bits.  Returns INFINITY when the exponent reaches
+ * LEXBOR_DBL_EXPONENT_MAX and 0.0 when it is below
+ * LEXBOR_DBL_EXPONENT_DENORMAL.
+ */
+lxb_inline double
+lexbor_diyfp_2d(lexbor_diyfp_t v)
+{
+    int exp;
+    uint64_t significand, biased_exp;
+
+    union {
+        double   d;
+        uint64_t u64;
+    }
+    u;
+
+    exp = v.exp;
+    significand = v.significand;
+
+    while (significand > LEXBOR_DBL_HIDDEN_BIT + LEXBOR_DBL_SIGNIFICAND_MASK) {
+        significand >>= 1;
+        exp++;
+    }
+
+    if (exp >= LEXBOR_DBL_EXPONENT_MAX) {
+        return INFINITY;
+    }
+
+    if (exp < LEXBOR_DBL_EXPONENT_DENORMAL) {
+        return 0.0;
+    }
+
+    /* Normalize: bring the hidden bit into place unless already denormal. */
+    while (exp > LEXBOR_DBL_EXPONENT_DENORMAL
+           && (significand & LEXBOR_DBL_HIDDEN_BIT) == 0)
+    {
+        significand <<= 1;
+        exp--;
+    }
+
+    if (exp == LEXBOR_DBL_EXPONENT_DENORMAL
+        && (significand & LEXBOR_DBL_HIDDEN_BIT) == 0)
+    {
+        biased_exp = 0;
+
+    } else {
+        biased_exp = (uint64_t) (exp + LEXBOR_DBL_EXPONENT_BIAS);
+    }
+
+    u.u64 = (significand & LEXBOR_DBL_SIGNIFICAND_MASK)
+            | (biased_exp << LEXBOR_DBL_SIGNIFICAND_SIZE);
+
+    return u.d;
+}
+
+/* Shifts the significand left and lowers the exponent by the same amount. */
+lxb_inline lexbor_diyfp_t
+lexbor_diyfp_shift_left(lexbor_diyfp_t v, unsigned shift)
+{
+    return lexbor_diyfp(v.significand << shift, v.exp - shift);
+}
+
+/* Shifts the significand right and raises the exponent by the same amount. */
+lxb_inline lexbor_diyfp_t
+lexbor_diyfp_shift_right(lexbor_diyfp_t v, unsigned shift)
+{
+    return lexbor_diyfp(v.significand >>
shift, v.exp + shift); +} + +lxb_inline lexbor_diyfp_t +lexbor_diyfp_sub(lexbor_diyfp_t lhs, lexbor_diyfp_t rhs) +{ + return lexbor_diyfp(lhs.significand - rhs.significand, lhs.exp); +} + +lxb_inline lexbor_diyfp_t +lexbor_diyfp_mul(lexbor_diyfp_t lhs, lexbor_diyfp_t rhs) +{ +#ifdef LEXBOR_HAVE_UNSIGNED_INT128 + + uint64_t l, h; + lxb_uint128_t u128; + + u128 = (lxb_uint128_t) (lhs.significand) + * (lxb_uint128_t) (rhs.significand); + + h = u128 >> 64; + l = (uint64_t) u128; + + /* rounding. */ + + if (l & ((uint64_t) 1 << 63)) { + h++; + } + + return lexbor_diyfp(h, lhs.exp + rhs.exp + 64); + +#else + + uint64_t a, b, c, d, ac, bc, ad, bd, tmp; + + a = lhs.significand >> 32; + b = lhs.significand & 0xffffffff; + c = rhs.significand >> 32; + d = rhs.significand & 0xffffffff; + + ac = a * c; + bc = b * c; + ad = a * d; + bd = b * d; + + tmp = (bd >> 32) + (ad & 0xffffffff) + (bc & 0xffffffff); + + /* mult_round. */ + + tmp += 1U << 31; + + return lexbor_diyfp(ac + (ad >> 32) + (bc >> 32) + (tmp >> 32), + lhs.exp + rhs.exp + 64); +#endif +} + +lxb_inline lexbor_diyfp_t +lexbor_diyfp_normalize(lexbor_diyfp_t v) +{ + return lexbor_diyfp_shift_left(v, + (unsigned) lexbor_diyfp_leading_zeros64(v.significand)); +} + + +#ifdef __cplusplus +} /* extern "C" */ +#endif + +#endif /* LEXBOR_DIYFP_H */ diff --git a/contrib/url/lexbor/core/dobject.c b/contrib/url/lexbor/core/dobject.c new file mode 100644 index 0000000000..0f5468fd58 --- /dev/null +++ b/contrib/url/lexbor/core/dobject.c @@ -0,0 +1,187 @@ +/* + * Copyright (C) 2018-2019 Alexander Borisov + * + * Author: Alexander Borisov + */ + +#include "lexbor/core/dobject.h" + + +#if defined(LEXBOR_HAVE_ADDRESS_SANITIZER) + #include +#endif + + +lexbor_dobject_t * +lexbor_dobject_create(void) +{ + return lexbor_calloc(1, sizeof(lexbor_dobject_t)); +} + +lxb_status_t +lexbor_dobject_init(lexbor_dobject_t *dobject, + size_t chunk_size, size_t struct_size) +{ + lxb_status_t status; + + if (dobject == NULL) { + return 
LXB_STATUS_ERROR_OBJECT_IS_NULL; + } + + if (chunk_size == 0 || struct_size == 0) { + return LXB_STATUS_ERROR_WRONG_ARGS; + } + + /* Set params */ + dobject->allocated = 0UL; + dobject->struct_size = struct_size; + + /* Init memory */ + dobject->mem = lexbor_mem_create(); + + status = lexbor_mem_init(dobject->mem, + lexbor_mem_align(chunk_size * dobject->struct_size)); + if (status) { + return status; + } + +#if defined(LEXBOR_HAVE_ADDRESS_SANITIZER) + ASAN_POISON_MEMORY_REGION(dobject->mem->chunk->data, + dobject->mem->chunk->size); +#endif + + /* Array */ + dobject->cache = lexbor_array_create(); + + status = lexbor_array_init(dobject->cache, chunk_size); + if (status) + return status; + + return LXB_STATUS_OK; +} + +void +lexbor_dobject_clean(lexbor_dobject_t *dobject) +{ + if (dobject != NULL) { + dobject->allocated = 0UL; + + lexbor_mem_clean(dobject->mem); + lexbor_array_clean(dobject->cache); + } +} + +lexbor_dobject_t * +lexbor_dobject_destroy(lexbor_dobject_t *dobject, bool destroy_self) +{ + if (dobject == NULL) + return NULL; + + dobject->mem = lexbor_mem_destroy(dobject->mem, true); + dobject->cache = lexbor_array_destroy(dobject->cache, true); + + if (destroy_self == true) { + return lexbor_free(dobject); + } + + return dobject; +} + +void * +lexbor_dobject_alloc(lexbor_dobject_t *dobject) +{ + void *data; + + if (lexbor_array_length(dobject->cache) != 0) { + dobject->allocated++; + +#if defined(LEXBOR_HAVE_ADDRESS_SANITIZER) + data = lexbor_array_pop(dobject->cache); + ASAN_UNPOISON_MEMORY_REGION(data, dobject->struct_size); + + return data; +#else + return lexbor_array_pop(dobject->cache); +#endif + } + + data = lexbor_mem_alloc(dobject->mem, dobject->struct_size); + if (data == NULL) { + return NULL; + } + +#if defined(LEXBOR_HAVE_ADDRESS_SANITIZER) + ASAN_UNPOISON_MEMORY_REGION(data, dobject->struct_size); +#endif + + dobject->allocated++; + + return data; +} + +void * +lexbor_dobject_calloc(lexbor_dobject_t *dobject) +{ + void *data = 
lexbor_dobject_alloc(dobject); + + if (data != NULL) { + memset(data, 0, dobject->struct_size); + } + + return data; +} + +void * +lexbor_dobject_free(lexbor_dobject_t *dobject, void *data) +{ + if (data == NULL) { + return NULL; + } + +#if defined(LEXBOR_HAVE_ADDRESS_SANITIZER) + ASAN_POISON_MEMORY_REGION(data, dobject->struct_size); +#endif + + if (lexbor_array_push(dobject->cache, data) == LXB_STATUS_OK) { + dobject->allocated--; + return NULL; + } + + return data; +} + +void * +lexbor_dobject_by_absolute_position(lexbor_dobject_t *dobject, size_t pos) +{ + size_t chunk_idx, chunk_pos, i; + lexbor_mem_chunk_t *chunk; + + if (pos >= dobject->allocated) { + return NULL; + } + + chunk = dobject->mem->chunk_first; + chunk_pos = pos * dobject->struct_size; + chunk_idx = chunk_pos / dobject->mem->chunk_min_size; + + for (i = 0; i < chunk_idx; i++) { + chunk = chunk->next; + } + + return &chunk->data[chunk_pos % chunk->size]; +} + +/* + * No inline functions for ABI. + */ +size_t +lexbor_dobject_allocated_noi(lexbor_dobject_t *dobject) +{ + return lexbor_dobject_allocated(dobject); +} + +size_t +lexbor_dobject_cache_length_noi(lexbor_dobject_t *dobject) +{ + return lexbor_dobject_cache_length(dobject); +} diff --git a/contrib/url/lexbor/core/dobject.h b/contrib/url/lexbor/core/dobject.h new file mode 100644 index 0000000000..c92930cfa9 --- /dev/null +++ b/contrib/url/lexbor/core/dobject.h @@ -0,0 +1,92 @@ +/* + * Copyright (C) 2018 Alexander Borisov + * + * Author: Alexander Borisov + */ + +#ifndef LEXBOR_DOBJECT_H +#define LEXBOR_DOBJECT_H + +#ifdef __cplusplus +extern "C" { +#endif + +#include "lexbor/core/base.h" +#include "lexbor/core/mem.h" +#include "lexbor/core/array.h" + + +typedef struct { + lexbor_mem_t *mem; + lexbor_array_t *cache; + + size_t allocated; + size_t struct_size; +} +lexbor_dobject_t; + + +LXB_API lexbor_dobject_t * +lexbor_dobject_create(void); + +LXB_API lxb_status_t +lexbor_dobject_init(lexbor_dobject_t *dobject, + size_t chunk_size, size_t 
struct_size); + +LXB_API void +lexbor_dobject_clean(lexbor_dobject_t *dobject); + +LXB_API lexbor_dobject_t * +lexbor_dobject_destroy(lexbor_dobject_t *dobject, bool destroy_self); + + +LXB_API uint8_t * +lexbor_dobject_init_list_entries(lexbor_dobject_t *dobject, size_t pos); + + +LXB_API void * +lexbor_dobject_alloc(lexbor_dobject_t *dobject); + +LXB_API void * +lexbor_dobject_calloc(lexbor_dobject_t *dobject); + +LXB_API void * +lexbor_dobject_free(lexbor_dobject_t *dobject, void *data); + + +LXB_API void * +lexbor_dobject_by_absolute_position(lexbor_dobject_t *dobject, size_t pos); + + +/* + * Inline functions + */ +lxb_inline size_t +lexbor_dobject_allocated(lexbor_dobject_t *dobject) +{ + return dobject->allocated; +} + +lxb_inline size_t +lexbor_dobject_cache_length(lexbor_dobject_t *dobject) +{ + return lexbor_array_length(dobject->cache); +} + +/* + * No inline functions for ABI. + */ +LXB_API size_t +lexbor_dobject_allocated_noi(lexbor_dobject_t *dobject); + +LXB_API size_t +lexbor_dobject_cache_length_noi(lexbor_dobject_t *dobject); + + +#ifdef __cplusplus +} /* extern "C" */ +#endif + +#endif /* LEXBOR_DOBJECT_H */ + + diff --git a/contrib/url/lexbor/core/dtoa.c b/contrib/url/lexbor/core/dtoa.c new file mode 100644 index 0000000000..b1a4ee0325 --- /dev/null +++ b/contrib/url/lexbor/core/dtoa.c @@ -0,0 +1,404 @@ +/* + * Copyright (C) Alexander Borisov + * + * Based on nxt_dtoa.c from NGINX NJS project + * + * Copyright (C) Dmitry Volyntsev + * Copyright (C) NGINX, Inc. + * + * Grisu2 algorithm implementation for printing floating-point numbers based + * upon the work of Milo Yip and Doug Currie. + * + * For algorithm information, see Loitsch, Florian. "Printing + * floating-point numbers quickly and accurately with integers." ACM Sigplan + * Notices 45.6 (2010): 233-243. 
+ * + * Copyright (C) 2015 Doug Currie + * based on dtoa_milo.h + * Copyright (C) 2014 Milo Yip + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ + +#include "lexbor/core/str.h" +#include "lexbor/core/diyfp.h" +#include "lexbor/core/dtoa.h" + +#include +#include + + +lxb_inline void +lexbor_grisu2_round(lxb_char_t *start, size_t len, uint64_t delta, uint64_t rest, + uint64_t ten_kappa, uint64_t wp_w) +{ + while (rest < wp_w && delta - rest >= ten_kappa + && (rest + ten_kappa < wp_w || /* closer */ + wp_w - rest > rest + ten_kappa - wp_w)) + { + start[len - 1]--; + rest += ten_kappa; + } +} + +lxb_inline int +lexbor_dec_count(uint32_t n) +{ + if (n < 10) return 1; + if (n < 100) return 2; + if (n < 1000) return 3; + if (n < 10000) return 4; + if (n < 100000) return 5; + if (n < 1000000) return 6; + if (n < 10000000) return 7; + if (n < 100000000) return 8; + if (n < 1000000000) return 9; + + return 10; +} + +lxb_inline size_t +lexbor_grisu2_gen(lexbor_diyfp_t W, lexbor_diyfp_t Mp, uint64_t delta, + lxb_char_t *begin, lxb_char_t *end, int *dec_exp) +{ + int kappa; + lxb_char_t c, *p; + uint32_t p1, d; + uint64_t p2, tmp; + lexbor_diyfp_t one, wp_w; + + static const uint64_t pow10[] = { + 1, + 10, + 100, + 1000, + 10000, + 100000, + 1000000, + 10000000, + 100000000, + 1000000000 + }; + + wp_w = lexbor_diyfp_sub(Mp, W); + + one = lexbor_diyfp((uint64_t) 1 << -Mp.exp, Mp.exp); + p1 = (uint32_t) (Mp.significand >> -one.exp); + p2 = Mp.significand & (one.significand - 1); + + p = begin; + + /* GCC 4.2 complains about uninitialized d. */ + d = 0; + + kappa = lexbor_dec_count(p1); + + while (kappa > 0) { + switch (kappa) { + case 10: d = p1 / 1000000000; p1 %= 1000000000; break; + case 9: d = p1 / 100000000; p1 %= 100000000; break; + case 8: d = p1 / 10000000; p1 %= 10000000; break; + case 7: d = p1 / 1000000; p1 %= 1000000; break; + case 6: d = p1 / 100000; p1 %= 100000; break; + case 5: d = p1 / 10000; p1 %= 10000; break; + case 4: d = p1 / 1000; p1 %= 1000; break; + case 3: d = p1 / 100; p1 %= 100; break; + case 2: d = p1 / 10; p1 %= 10; break; + case 1: d = p1; p1 = 0; break; + default: + /* Never go here. 
*/ + return 0; + } + + if (d != 0 || p != begin) { + *p = '0' + d; + + p += 1; + if (p == end) { + return p - begin; + } + } + + kappa--; + + tmp = ((uint64_t) p1 << -one.exp) + p2; + + if (tmp <= delta) { + *dec_exp += kappa; + lexbor_grisu2_round(begin, p - begin, delta, tmp, + pow10[kappa] << -one.exp, wp_w.significand); + return p - begin; + } + } + + /* kappa = 0. */ + + for ( ;; ) { + p2 *= 10; + delta *= 10; + c = (char) (p2 >> -one.exp); + + if (c != 0 || p != begin) { + *p = '0' + c; + + p += 1; + if (p == end) { + return p - begin; + } + } + + p2 &= one.significand - 1; + kappa--; + + if (p2 < delta) { + *dec_exp += kappa; + tmp = (-kappa < 10) ? pow10[-kappa] : 0; + lexbor_grisu2_round(begin, p - begin, delta, p2, one.significand, + wp_w.significand * tmp); + break; + } + } + + return p - begin; +} + +lxb_inline lexbor_diyfp_t +lexbor_diyfp_normalize_boundary(lexbor_diyfp_t v) +{ + while ((v.significand & (LEXBOR_DBL_HIDDEN_BIT << 1)) == 0) { + v.significand <<= 1; + v.exp--; + } + + return lexbor_diyfp_shift_left(v, LEXBOR_SIGNIFICAND_SHIFT - 2); +} + +lxb_inline void +lexbor_diyfp_normalize_boundaries(lexbor_diyfp_t v, lexbor_diyfp_t* minus, + lexbor_diyfp_t* plus) +{ + lexbor_diyfp_t pl, mi; + + pl = lexbor_diyfp_normalize_boundary(lexbor_diyfp((v.significand << 1) + 1, + v.exp - 1)); + + if (v.significand == LEXBOR_DBL_HIDDEN_BIT) { + mi = lexbor_diyfp((v.significand << 2) - 1, v.exp - 2); + + } else { + mi = lexbor_diyfp((v.significand << 1) - 1, v.exp - 1); + } + + mi.significand <<= mi.exp - pl.exp; + mi.exp = pl.exp; + + *plus = pl; + *minus = mi; +} + +lxb_inline size_t +lexbor_grisu2(double value, lxb_char_t *begin, lxb_char_t *end, int *dec_exp) +{ + lexbor_diyfp_t v, w_m, w_p, c_mk, W, Wp, Wm; + + v = lexbor_diyfp_from_d2(value); + + lexbor_diyfp_normalize_boundaries(v, &w_m, &w_p); + + c_mk = lexbor_cached_power_bin(w_p.exp, dec_exp); + W = lexbor_diyfp_mul(lexbor_diyfp_normalize(v), c_mk); + + Wp = lexbor_diyfp_mul(w_p, c_mk); + Wm = 
lexbor_diyfp_mul(w_m, c_mk); + + Wm.significand++; + Wp.significand--; + + return lexbor_grisu2_gen(W, Wp, Wp.significand - Wm.significand, begin, end, + dec_exp); +} + +lxb_inline size_t +lexbor_write_exponent(int exp, lxb_char_t *begin, lxb_char_t *end) +{ + char *p; + size_t len; + uint32_t u32; + char buf[4]; + + /* -324 <= exp <= 308. */ + + if ((begin + (sizeof(buf) - 1) + 1) >= end) { + return 0; + } + + if (exp < 0) { + *begin = '-'; + begin += 1; + + exp = -exp; + } + else { + *begin++ = '+'; + } + + u32 = exp; + p = buf + (sizeof(buf) - 1); + + do { + *--p = u32 % 10 + '0'; + u32 /= 10; + } + while (u32 != 0); + + len = buf + (sizeof(buf) - 1) - p; + + memcpy(begin, p, len); + + return len + 1; +} + +lxb_inline size_t +lexbor_prettify(lxb_char_t *begin, lxb_char_t *end, size_t len, int dec_exp) +{ + int kk, offset, length; + size_t size; + + /* 10^(kk-1) <= v < 10^kk */ + + length = (int) len; + kk = length + dec_exp; + + if (length <= kk && kk <= 21) { + /* 1234e7 -> 12340000000 */ + + if (kk - length > 0) { + if ((&begin[length] + (kk - length)) < end) { + memset(&begin[length], '0', kk - length); + } + else { + memset(&begin[length], '0', (end - &begin[length])); + } + } + + return kk; + } + else if (0 < kk && kk <= 21) { + /* 1234e-2 -> 12.34 */ + + if ((&begin[kk + 1] + (length - kk)) >= end) { + return length; + } + + memmove(&begin[kk + 1], &begin[kk], length - kk); + begin[kk] = '.'; + + return (length + 1); + } + else if (-6 < kk && kk <= 0) { + /* 1234e-6 -> 0.001234 */ + + offset = 2 - kk; + if ((&begin[offset] + length) >= end + || (begin + 2) >= end) + { + return length; + } + + memmove(&begin[offset], begin, length); + begin[0] = '0'; + begin[1] = '.'; + + if (offset - 2 > 0) { + if ((&begin[2] + (offset - 2)) >= end) { + return length; + } + + memset(&begin[2], '0', offset - 2); + } + + return (length + offset); + } + else if (length == 1) { + /* 1e30 */ + + if ((begin + 1) >= end) { + return length; + } + + begin[1] = 'e'; + + size = 
lexbor_write_exponent(kk - 1, &begin[2], end); + + return (size + 2); + } + + /* 1234e30 -> 1.234e33 */ + + if ((&begin[2] + (length - 1)) >= end) { + return length; + } + + memmove(&begin[2], &begin[1], length - 1); + begin[1] = '.'; + begin[length + 1] = 'e'; + + size = lexbor_write_exponent(kk - 1, &begin[length + 2], end); + + return (size + length + 2); +} + +size_t +lexbor_dtoa(double value, lxb_char_t *begin, size_t len) +{ + int dec_exp, minus; + size_t length; + lxb_char_t *end = begin + len; + + if (len == 0) { + return 0; + } + + /* Not handling NaN and inf. */ + + minus = 0; + + if (value == 0) { + *begin = '0'; + + return 1; + } + + if (signbit(value)) { + *begin = '-'; + + begin += 1; + if (begin == end) { + return 1; + } + + value = -value; + minus = 1; + } + + length = lexbor_grisu2(value, begin, end, &dec_exp); + length = lexbor_prettify(begin, end, length, dec_exp); + + return (minus + length); +} diff --git a/contrib/url/lexbor/core/dtoa.h b/contrib/url/lexbor/core/dtoa.h new file mode 100644 index 0000000000..c60c28803b --- /dev/null +++ b/contrib/url/lexbor/core/dtoa.h @@ -0,0 +1,28 @@ +/* + * Copyright (C) Alexander Borisov + * + * Based on nxt_dtoa.h from NGINX NJS project + * + * Copyright (C) Dmitry Volyntsev + * Copyright (C) Nginx, Inc. 
+ */ + +#ifndef LEXBOR_DTOA_H +#define LEXBOR_DTOA_H + +#ifdef __cplusplus +extern "C" { +#endif + +#include "lexbor/core/base.h" + + +LXB_API size_t +lexbor_dtoa(double value, lxb_char_t *begin, size_t len); + + +#ifdef __cplusplus +} /* extern "C" */ +#endif + +#endif /* LEXBOR_DTOA_H */ diff --git a/contrib/url/lexbor/core/lexbor.h b/contrib/url/lexbor/core/lexbor.h new file mode 100644 index 0000000000..29c4035674 --- /dev/null +++ b/contrib/url/lexbor/core/lexbor.h @@ -0,0 +1,43 @@ +/* + * Copyright (C) 2018 Alexander Borisov + * + * Author: Alexander Borisov + */ + +#ifndef LEXBOR_H +#define LEXBOR_H + +#ifdef __cplusplus +extern "C" { +#endif + +#include "lexbor/core/def.h" + +typedef void *(*lexbor_memory_malloc_f)(size_t size); +typedef void *(*lexbor_memory_realloc_f)(void *dst, size_t size); +typedef void *(*lexbor_memory_calloc_f)(size_t num, size_t size); +typedef void (*lexbor_memory_free_f)(void *dst); + +LXB_API void * +lexbor_malloc(size_t size); + +LXB_API void * +lexbor_realloc(void *dst, size_t size); + +LXB_API void * +lexbor_calloc(size_t num, size_t size); + +LXB_API void * +lexbor_free(void *dst); + +LXB_API lxb_status_t +lexbor_memory_setup(lexbor_memory_malloc_f new_malloc, lexbor_memory_realloc_f new_realloc, + lexbor_memory_calloc_f new_calloc, lexbor_memory_free_f new_free); + + +#ifdef __cplusplus +} /* extern "C" */ +#endif + +#endif /* LEXBOR_H */ + diff --git a/contrib/url/lexbor/core/mem.c b/contrib/url/lexbor/core/mem.c new file mode 100644 index 0000000000..9192af42ed --- /dev/null +++ b/contrib/url/lexbor/core/mem.c @@ -0,0 +1,228 @@ +/* + * Copyright (C) 2018 Alexander Borisov + * + * Author: Alexander Borisov + */ + +#include "lexbor/core/mem.h" + + +lexbor_mem_t * +lexbor_mem_create(void) +{ + return lexbor_calloc(1, sizeof(lexbor_mem_t)); +} + +lxb_status_t +lexbor_mem_init(lexbor_mem_t *mem, size_t min_chunk_size) +{ + if (mem == NULL) { + return LXB_STATUS_ERROR_OBJECT_IS_NULL; + } + + if (min_chunk_size == 0) { + return 
LXB_STATUS_ERROR_WRONG_ARGS; + } + + mem->chunk_min_size = lexbor_mem_align(min_chunk_size); + + /* Create first chunk */ + mem->chunk = lexbor_mem_chunk_make(mem, mem->chunk_min_size); + if (mem->chunk == NULL) { + return LXB_STATUS_ERROR_MEMORY_ALLOCATION; + } + + mem->chunk_length = 1; + mem->chunk_first = mem->chunk; + + return LXB_STATUS_OK; +} + +void +lexbor_mem_clean(lexbor_mem_t *mem) +{ + lexbor_mem_chunk_t *prev, *chunk; + + if (mem == NULL) { + return; + } + + chunk = mem->chunk; + + while (chunk->prev) { + prev = chunk->prev; + + chunk->data = lexbor_free(chunk->data); + lexbor_free(chunk); + + chunk = prev; + } + + chunk->next = NULL; + chunk->length = 0; + + mem->chunk = mem->chunk_first; + mem->chunk_length = 1; +} + +lexbor_mem_t * +lexbor_mem_destroy(lexbor_mem_t *mem, bool destroy_self) +{ + lexbor_mem_chunk_t *chunk, *prev; + + if (mem == NULL) { + return NULL; + } + + /* Destroy all chunk */ + if (mem->chunk) { + chunk = mem->chunk; + + while (chunk) { + prev = chunk->prev; + lexbor_mem_chunk_destroy(mem, chunk, true); + chunk = prev; + } + + mem->chunk = NULL; + } + + if (destroy_self) { + return lexbor_free(mem); + } + + return mem; +} + +uint8_t * +lexbor_mem_chunk_init(lexbor_mem_t *mem, + lexbor_mem_chunk_t *chunk, size_t length) +{ + length = lexbor_mem_align(length); + + if (length > mem->chunk_min_size) { + if (mem->chunk_min_size > (SIZE_MAX - length)) { + chunk->size = length; + } + else { + chunk->size = length + mem->chunk_min_size; + } + } + else { + chunk->size = mem->chunk_min_size; + } + + chunk->length = 0; + chunk->data = lexbor_malloc(chunk->size * sizeof(uint8_t)); + + return chunk->data; +} + +lexbor_mem_chunk_t * +lexbor_mem_chunk_make(lexbor_mem_t *mem, size_t length) +{ + lexbor_mem_chunk_t *chunk = lexbor_calloc(1, sizeof(lexbor_mem_chunk_t)); + + if (chunk == NULL) { + return NULL; + } + + if (lexbor_mem_chunk_init(mem, chunk, length) == NULL) { + return lexbor_free(chunk); + } + + return chunk; +} + 
+/*
+ * Frees the buffer of `chunk` and, with self_destroy set, the chunk
+ * descriptor itself.
+ *
+ * Returns NULL when `chunk` or `mem` is NULL or when the descriptor was
+ * freed; otherwise returns `chunk` with its data pointer cleared.
+ */
+lexbor_mem_chunk_t *
+lexbor_mem_chunk_destroy(lexbor_mem_t *mem,
+                         lexbor_mem_chunk_t *chunk, bool self_destroy)
+{
+    if (chunk == NULL || mem == NULL) {
+        return NULL;
+    }
+
+    if (chunk->data) {
+        chunk->data = lexbor_free(chunk->data);
+    }
+
+    if (self_destroy) {
+        return lexbor_free(chunk);
+    }
+
+    return chunk;
+}
+
+/*
+ * Hands out `length` bytes (rounded up with lexbor_mem_align()) from the
+ * current chunk, appending a new chunk when the current one is too small.
+ *
+ * Returns NULL for a zero length, for a length whose aligned size does not
+ * fit in size_t, when the chunk counter is exhausted, or when a new chunk
+ * cannot be allocated.
+ */
+void *
+lexbor_mem_alloc(lexbor_mem_t *mem, size_t length)
+{
+    size_t aligned;
+
+    if (length == 0) {
+        return NULL;
+    }
+
+    aligned = lexbor_mem_align(length);
+
+    /* Rounding up wrapped past SIZE_MAX: the request cannot be served. */
+    if (aligned < length) {
+        return NULL;
+    }
+
+    /* The first test keeps the sum in the second one from wrapping. */
+    if (aligned > (SIZE_MAX - mem->chunk->length)
+        || (mem->chunk->length + aligned) > mem->chunk->size)
+    {
+        if ((SIZE_MAX - mem->chunk_length) == 0) {
+            return NULL;
+        }
+
+        mem->chunk->next = lexbor_mem_chunk_make(mem, aligned);
+        if (mem->chunk->next == NULL) {
+            return NULL;
+        }
+
+        mem->chunk->next->prev = mem->chunk;
+        mem->chunk = mem->chunk->next;
+
+        mem->chunk_length++;
+    }
+
+    mem->chunk->length += aligned;
+
+    return &mem->chunk->data[(mem->chunk->length - aligned)];
+}
+
+/*
+ * Same as lexbor_mem_alloc(), with the first `length` bytes of the result
+ * set to zero.
+ */
+void *
+lexbor_mem_calloc(lexbor_mem_t *mem, size_t length)
+{
+    void *data = lexbor_mem_alloc(mem, length);
+
+    if (data != NULL) {
+        memset(data, 0, length);
+    }
+
+    return data;
+}
+
+/*
+ * No inline functions for ABI.
+ */ +size_t +lexbor_mem_current_length_noi(lexbor_mem_t *mem) +{ + return lexbor_mem_current_length(mem); +} + +size_t +lexbor_mem_current_size_noi(lexbor_mem_t *mem) +{ + return lexbor_mem_current_size(mem); +} + +size_t +lexbor_mem_chunk_length_noi(lexbor_mem_t *mem) +{ + return lexbor_mem_chunk_length(mem); +} +size_t +lexbor_mem_align_noi(size_t size) +{ + return lexbor_mem_align(size); +} + +size_t +lexbor_mem_align_floor_noi(size_t size) +{ + return lexbor_mem_align_floor(size); +} diff --git a/contrib/url/lexbor/core/mem.h b/contrib/url/lexbor/core/mem.h new file mode 100644 index 0000000000..f245528d52 --- /dev/null +++ b/contrib/url/lexbor/core/mem.h @@ -0,0 +1,141 @@ +/* + * Copyright (C) 2018 Alexander Borisov + * + * Author: Alexander Borisov + */ + +#ifndef LEXBOR_MEM_H +#define LEXBOR_MEM_H + +#ifdef __cplusplus +extern "C" { +#endif + +#include + +#include "lexbor/core/base.h" + + +typedef struct lexbor_mem_chunk lexbor_mem_chunk_t; +typedef struct lexbor_mem lexbor_mem_t; + +struct lexbor_mem_chunk { + uint8_t *data; + size_t length; + size_t size; + + lexbor_mem_chunk_t *next; + lexbor_mem_chunk_t *prev; +}; + +struct lexbor_mem { + lexbor_mem_chunk_t *chunk; + lexbor_mem_chunk_t *chunk_first; + + size_t chunk_min_size; + size_t chunk_length; +}; + + +LXB_API lexbor_mem_t * +lexbor_mem_create(void); + +LXB_API lxb_status_t +lexbor_mem_init(lexbor_mem_t *mem, size_t min_chunk_size); + +LXB_API void +lexbor_mem_clean(lexbor_mem_t *mem); + +LXB_API lexbor_mem_t * +lexbor_mem_destroy(lexbor_mem_t *mem, bool destroy_self); + + +/* + * The memory allocated in lexbor_mem_chunk_* functions needs to be freed + * by lexbor_mem_chunk_destroy function. + * + * This memory will not be automatically freed by a function lexbor_mem_destroy. 
+ */
+LXB_API uint8_t *
+lexbor_mem_chunk_init(lexbor_mem_t *mem,
+                      lexbor_mem_chunk_t *chunk, size_t length);
+/* Allocates a chunk descriptor and its buffer; NULL on failure. */
+LXB_API lexbor_mem_chunk_t *
+lexbor_mem_chunk_make(lexbor_mem_t *mem, size_t length);
+/* Frees the chunk buffer and, if self_destroy is set, the chunk itself. */
+LXB_API lexbor_mem_chunk_t *
+lexbor_mem_chunk_destroy(lexbor_mem_t *mem,
+                         lexbor_mem_chunk_t *chunk, bool self_destroy);
+
+/*
+ * The memory allocated by the lexbor_mem_alloc and lexbor_mem_calloc
+ * functions is freed by calling the lexbor_mem_destroy function.
+ */
+LXB_API void *
+lexbor_mem_alloc(lexbor_mem_t *mem, size_t length);
+
+LXB_API void *
+lexbor_mem_calloc(lexbor_mem_t *mem, size_t length);
+
+
+/*
+ * Inline functions
+ */
+lxb_inline size_t
+lexbor_mem_current_length(lexbor_mem_t *mem)
+{
+    return mem->chunk->length;
+}
+/* Allocated size, in bytes, of the current chunk's buffer. */
+lxb_inline size_t
+lexbor_mem_current_size(lexbor_mem_t *mem)
+{
+    return mem->chunk->size;
+}
+/* Value of the allocator's chunk counter (mem->chunk_length). */
+lxb_inline size_t
+lexbor_mem_chunk_length(lexbor_mem_t *mem)
+{
+    return mem->chunk_length;
+}
+/*
+ * Rounds `size` up to the next multiple of LEXBOR_MEM_ALIGN_STEP; a size
+ * that is already a multiple is returned unchanged.  The addition is not
+ * checked for wrap-around.
+ */
+lxb_inline size_t
+lexbor_mem_align(size_t size)
+{
+    return ((size % LEXBOR_MEM_ALIGN_STEP) != 0)
+           ? size + (LEXBOR_MEM_ALIGN_STEP - (size % LEXBOR_MEM_ALIGN_STEP))
+           : size;
+}
+/* Rounds `size` down to a multiple of LEXBOR_MEM_ALIGN_STEP. */
+lxb_inline size_t
+lexbor_mem_align_floor(size_t size)
+{
+    return ((size % LEXBOR_MEM_ALIGN_STEP) != 0)
+           ? size - (size % LEXBOR_MEM_ALIGN_STEP)
+           : size;
+}
+
+/*
+ * No inline functions for ABI.
+ */
+LXB_API size_t
+lexbor_mem_current_length_noi(lexbor_mem_t *mem);
+
+LXB_API size_t
+lexbor_mem_current_size_noi(lexbor_mem_t *mem);
+
+LXB_API size_t
+lexbor_mem_chunk_length_noi(lexbor_mem_t *mem);
+
+LXB_API size_t
+lexbor_mem_align_noi(size_t size);
+
+LXB_API size_t
+lexbor_mem_align_floor_noi(size_t size);
+
+
+#ifdef __cplusplus
+} /* extern "C" */
+#endif
+
+#endif /* LEXBOR_MEM_H */
diff --git a/contrib/url/lexbor/core/memory.c b/contrib/url/lexbor/core/memory.c
new file mode 100644
index 0000000000..038e040440
--- /dev/null
+++ b/contrib/url/lexbor/core/memory.c
@@ -0,0 +1,53 @@
+/*
+ * Copyright (C) 2018 Alexander Borisov
+ *
+ * Author: Alexander Borisov
+ */
+
+#include "lexbor/core/base.h"
+/*
+ * Allocator callbacks used by the lexbor_* wrappers below.  They start out
+ * as the C library functions and can be replaced by lexbor_memory_setup().
+ */
+static lexbor_memory_malloc_f lexbor_memory_malloc = malloc;
+static lexbor_memory_realloc_f lexbor_memory_realloc = realloc;
+static lexbor_memory_calloc_f lexbor_memory_calloc = calloc;
+static lexbor_memory_free_f lexbor_memory_free = free;
+/* Forwards to the currently installed malloc callback. */
+void *
+lexbor_malloc(size_t size)
+{
+    return lexbor_memory_malloc(size);
+}
+/* Forwards to the currently installed realloc callback. */
+void *
+lexbor_realloc(void *dst, size_t size)
+{
+    return lexbor_memory_realloc(dst, size);
+}
+/* Forwards to the currently installed calloc callback. */
+void *
+lexbor_calloc(size_t num, size_t size)
+{
+    return lexbor_memory_calloc(num, size);
+}
+/*
+ * Releases `dst` through the installed free callback.  Always returns NULL,
+ * so callers can write `ptr = lexbor_free(ptr);`.
+ */
+void *
+lexbor_free(void *dst)
+{
+    lexbor_memory_free(dst);
+    return NULL;
+}
+/*
+ * Installs a new set of allocator callbacks.
+ *
+ * All four must be non-NULL; otherwise nothing is changed and
+ * LXB_STATUS_ERROR_OBJECT_IS_NULL is returned.  Returns LXB_STATUS_OK after
+ * the callbacks have been replaced.
+ */
+lxb_status_t
+lexbor_memory_setup(lexbor_memory_malloc_f new_malloc, lexbor_memory_realloc_f new_realloc,
+                    lexbor_memory_calloc_f new_calloc, lexbor_memory_free_f new_free)
+{
+    if (new_malloc == NULL || new_realloc == NULL || new_calloc == NULL || new_free == NULL) {
+        return LXB_STATUS_ERROR_OBJECT_IS_NULL;
+    }
+
+    lexbor_memory_malloc = new_malloc;
+    lexbor_memory_realloc = new_realloc;
+    lexbor_memory_calloc = new_calloc;
+    lexbor_memory_free = new_free;
+
+    return LXB_STATUS_OK;
+}
diff --git a/contrib/url/lexbor/core/mraw.c b/contrib/url/lexbor/core/mraw.c
new file mode 100644
index 0000000000..29d0383c4b
--- /dev/null
+++ b/contrib/url/lexbor/core/mraw.c
@@ -0,0 +1,429 @@ +/* + * Copyright (C) 2018-2019 Alexander Borisov + * + * Author: Alexander Borisov + */ + +#include "lexbor/core/mraw.h" + + +#if defined(LEXBOR_HAVE_ADDRESS_SANITIZER) + #include <sanitizer/asan_interface.h> +#endif + + +#define lexbor_mraw_meta_set(data, size) \ + do { \ + memcpy(data, size, sizeof(size_t)); \ + } \ + while (0) + +#define lexbor_mraw_data_begin(data) \ + &((uint8_t *) (data))[ lexbor_mraw_meta_size() ] + + +lxb_inline void * +lexbor_mraw_realloc_tail(lexbor_mraw_t *mraw, void *data, void *begin, + size_t size, size_t begin_len, size_t new_size, + bool *is_valid); + + +lexbor_mraw_t * +lexbor_mraw_create(void) +{ + return lexbor_calloc(1, sizeof(lexbor_mraw_t)); +} + +lxb_status_t +lexbor_mraw_init(lexbor_mraw_t *mraw, size_t chunk_size) +{ + lxb_status_t status; + + if (mraw == NULL) { + return LXB_STATUS_ERROR_OBJECT_IS_NULL; + } + + if (chunk_size == 0) { + return LXB_STATUS_ERROR_WRONG_ARGS; + } + + /* Init memory */ + mraw->mem = lexbor_mem_create(); + + status = lexbor_mem_init(mraw->mem, chunk_size + lexbor_mraw_meta_size()); + if (status) { + return status; + } + +#if defined(LEXBOR_HAVE_ADDRESS_SANITIZER) + ASAN_POISON_MEMORY_REGION(mraw->mem->chunk->data, mraw->mem->chunk->size); +#endif + + /* Cache */ + mraw->cache = lexbor_bst_create(); + + status = lexbor_bst_init(mraw->cache, 512); + if (status) { + return status; + } + + mraw->ref_count = 0; + + return LXB_STATUS_OK; +} + +void +lexbor_mraw_clean(lexbor_mraw_t *mraw) +{ + if (mraw != NULL) { + lexbor_mem_clean(mraw->mem); + lexbor_bst_clean(mraw->cache); + + mraw->ref_count = 0; + } +} + +lexbor_mraw_t * +lexbor_mraw_destroy(lexbor_mraw_t *mraw, bool destroy_self) +{ + if (mraw == NULL) { + return NULL; + } + + mraw->mem = lexbor_mem_destroy(mraw->mem, true); + mraw->cache = lexbor_bst_destroy(mraw->cache, true); + + if (destroy_self) { + return lexbor_free(mraw); + } + + return mraw; +} + +lxb_inline void * +lexbor_mraw_mem_alloc(lexbor_mraw_t *mraw, size_t length) +{ + size_t diff; + 
uint8_t *data; + lexbor_mem_t *mem = mraw->mem; + + if (length == 0) { + return NULL; + } + + if ((mem->chunk->length + length) > mem->chunk->size) { + lexbor_mem_chunk_t *chunk = mem->chunk; + + if ((SIZE_MAX - mem->chunk_length) == 0) { + return NULL; + } + + if (chunk->length == 0) { + lexbor_mem_chunk_destroy(mem, chunk, false); + lexbor_mem_chunk_init(mem, chunk, length); + + chunk->length = length; + +#if defined(LEXBOR_HAVE_ADDRESS_SANITIZER) + ASAN_POISON_MEMORY_REGION(chunk->data, chunk->size); +#endif + + return chunk->data; + } + + diff = lexbor_mem_align_floor(chunk->size - chunk->length); + + /* Save tail to cache */ + if (diff > lexbor_mraw_meta_size()) { + diff -= lexbor_mraw_meta_size(); + +#if defined(LEXBOR_HAVE_ADDRESS_SANITIZER) + ASAN_UNPOISON_MEMORY_REGION(&chunk->data[chunk->length], + lexbor_mraw_meta_size()); +#endif + + lexbor_mraw_meta_set(&chunk->data[chunk->length], &diff); + +#if defined(LEXBOR_HAVE_ADDRESS_SANITIZER) + ASAN_POISON_MEMORY_REGION(&chunk->data[chunk->length], + diff + lexbor_mraw_meta_size()); +#endif + + lexbor_bst_insert(mraw->cache, + lexbor_bst_root_ref(mraw->cache), diff, + lexbor_mraw_data_begin(&chunk->data[chunk->length])); + + chunk->length = chunk->size; + } + + chunk->next = lexbor_mem_chunk_make(mem, length); + if (chunk->next == NULL) { + return NULL; + } + + chunk->next->prev = chunk; + mem->chunk = chunk->next; + + mem->chunk_length++; + +#if defined(LEXBOR_HAVE_ADDRESS_SANITIZER) + ASAN_POISON_MEMORY_REGION(mem->chunk->data, mem->chunk->size); +#endif + } + + data = &mem->chunk->data[ mem->chunk->length ]; + mem->chunk->length += length; + + return data; +} + +void * +lexbor_mraw_alloc(lexbor_mraw_t *mraw, size_t size) +{ + void *data; + + size = lexbor_mem_align(size); + + if (mraw->cache->tree_length != 0) { + data = lexbor_bst_remove_close(mraw->cache, + lexbor_bst_root_ref(mraw->cache), + size, NULL); + if (data != NULL) { + +#if defined(LEXBOR_HAVE_ADDRESS_SANITIZER) + uint8_t *real_data = ((uint8_t 
*) data) - lexbor_mraw_meta_size(); + + /* Set unpoison for current data size */ + ASAN_UNPOISON_MEMORY_REGION(real_data, lexbor_mraw_meta_size()); + + size_t cur_size = lexbor_mraw_data_size(data); + + ASAN_UNPOISON_MEMORY_REGION(real_data, + (cur_size + lexbor_mraw_meta_size())); +#endif + + mraw->ref_count++; + + return data; + } + } + + data = lexbor_mraw_mem_alloc(mraw, (size + lexbor_mraw_meta_size())); + + if (data == NULL) { + return NULL; + } + +#if defined(LEXBOR_HAVE_ADDRESS_SANITIZER) + ASAN_UNPOISON_MEMORY_REGION(data, (size + lexbor_mraw_meta_size())); +#endif + + mraw->ref_count++; + + lexbor_mraw_meta_set(data, &size); + return lexbor_mraw_data_begin(data); +} + +void * +lexbor_mraw_calloc(lexbor_mraw_t *mraw, size_t size) +{ + void *data = lexbor_mraw_alloc(mraw, size); + + if (data != NULL) { + memset(data, 0, lexbor_mraw_data_size(data)); + } + + return data; +} + +/* + * TODO: I don't really like this interface. Perhaps need to simplify. + */ +lxb_inline void * +lexbor_mraw_realloc_tail(lexbor_mraw_t *mraw, void *data, void *begin, + size_t size, size_t begin_len, size_t new_size, + bool *is_valid) +{ + lexbor_mem_chunk_t *chunk = mraw->mem->chunk; + + if (chunk->size > (begin_len + new_size)) { + *is_valid = true; + + if (new_size == 0) { + chunk->length = begin_len - lexbor_mraw_meta_size(); + return NULL; + } + +#if defined(LEXBOR_HAVE_ADDRESS_SANITIZER) + ASAN_UNPOISON_MEMORY_REGION(begin, new_size + lexbor_mraw_meta_size()); +#endif + + chunk->length = begin_len + new_size; + memcpy(begin, &new_size, sizeof(size_t)); + + return data; + } + + /* + * If the tail is short then we increase the current data. 
+ */ + if (begin_len == lexbor_mraw_meta_size()) { + void *new_data; + lexbor_mem_chunk_t new_chunk; + + *is_valid = true; + + lexbor_mem_chunk_init(mraw->mem, &new_chunk, + new_size + lexbor_mraw_meta_size()); + if(new_chunk.data == NULL) { + return NULL; + } + + lexbor_mraw_meta_set(new_chunk.data, &new_size); + new_data = lexbor_mraw_data_begin(new_chunk.data); + + if (size != 0) { + memcpy(new_data, data, sizeof(uint8_t) * size); + } + +#if defined(LEXBOR_HAVE_ADDRESS_SANITIZER) + ASAN_UNPOISON_MEMORY_REGION(chunk->data, chunk->size); +#endif + + lexbor_mem_chunk_destroy(mraw->mem, chunk, false); + + chunk->data = new_chunk.data; + chunk->size = new_chunk.size; + chunk->length = new_size + lexbor_mraw_meta_size(); + + return new_data; + } + + *is_valid = false; + + /* + * Next, this piece will go into the cache. + */ + size = lexbor_mem_align_floor(size + (chunk->size - chunk->length)); + memcpy(begin, &size, sizeof(size_t)); + + chunk->length = chunk->size; + + return NULL; +} + +void * +lexbor_mraw_realloc(lexbor_mraw_t *mraw, void *data, size_t new_size) +{ + void *begin; + size_t size, begin_len, diff; + lexbor_mem_chunk_t *chunk = mraw->mem->chunk; + + begin = ((uint8_t *) data) - lexbor_mraw_meta_size(); + memcpy(&size, begin, sizeof(size_t)); + + new_size = lexbor_mem_align(new_size); + + /* + * Look, whether there is an opportunity + * to prolong the current data in chunk? 
+ */ + if (chunk->length >= size) { + begin_len = chunk->length - size; + + if (&chunk->data[begin_len] == data) { + bool is_valid; + void *ptr = lexbor_mraw_realloc_tail(mraw, data, begin, + size, begin_len, new_size, + &is_valid); + if (is_valid == true) { + return ptr; + } + } + } + + if (new_size < size) { + if (new_size == 0) { + +#if defined(LEXBOR_HAVE_ADDRESS_SANITIZER) + ASAN_POISON_MEMORY_REGION(begin, size + lexbor_mraw_meta_size()); +#endif + mraw->ref_count--; + + lexbor_bst_insert(mraw->cache, lexbor_bst_root_ref(mraw->cache), + size, data); + return NULL; + } + + diff = lexbor_mem_align_floor(size - new_size); + + if (diff > lexbor_mraw_meta_size()) { + memcpy(begin, &new_size, sizeof(size_t)); + + new_size = diff - lexbor_mraw_meta_size(); + begin = &((uint8_t *) data)[diff]; + + lexbor_mraw_meta_set(begin, &new_size); + +#if defined(LEXBOR_HAVE_ADDRESS_SANITIZER) + ASAN_POISON_MEMORY_REGION(begin, new_size + lexbor_mraw_meta_size()); +#endif + lexbor_bst_insert(mraw->cache, lexbor_bst_root_ref(mraw->cache), + new_size, lexbor_mraw_data_begin(begin)); + } + + return data; + } + + begin = lexbor_mraw_alloc(mraw, new_size); + if (begin == NULL) { + return NULL; + } + + if (size != 0) { + memcpy(begin, data, sizeof(uint8_t) * size); + } + + lexbor_mraw_free(mraw, data); + + return begin; +} + +void * +lexbor_mraw_free(lexbor_mraw_t *mraw, void *data) +{ + size_t size = lexbor_mraw_data_size(data); + +#if defined(LEXBOR_HAVE_ADDRESS_SANITIZER) + uint8_t *real_data = ((uint8_t *) data) - lexbor_mraw_meta_size(); + ASAN_POISON_MEMORY_REGION(real_data, size + lexbor_mraw_meta_size()); +#endif + + lexbor_bst_insert(mraw->cache, lexbor_bst_root_ref(mraw->cache), + size, data); + + mraw->ref_count--; + + return NULL; +} + +/* + * No inline functions for ABI. 
+ */ +size_t +lexbor_mraw_data_size_noi(void *data) +{ + return lexbor_mraw_data_size(data); +} + +void +lexbor_mraw_data_size_set_noi(void *data, size_t size) +{ + lexbor_mraw_data_size_set(data, size); +} + +void * +lexbor_mraw_dup_noi(lexbor_mraw_t *mraw, const void *src, size_t size) +{ + return lexbor_mraw_dup(mraw, src, size); +} diff --git a/contrib/url/lexbor/core/mraw.h b/contrib/url/lexbor/core/mraw.h new file mode 100644 index 0000000000..1ffb64dca2 --- /dev/null +++ b/contrib/url/lexbor/core/mraw.h @@ -0,0 +1,114 @@ +/* + * Copyright (C) 2018 Alexander Borisov + * + * Author: Alexander Borisov + */ + +#ifndef LEXBOR_MRAW_H +#define LEXBOR_MRAW_H + +#ifdef __cplusplus +extern "C" { +#endif + +#include <string.h> + +#include "lexbor/core/base.h" +#include "lexbor/core/mem.h" +#include "lexbor/core/bst.h" + + +#define lexbor_mraw_meta_size() \ + (((sizeof(size_t) % LEXBOR_MEM_ALIGN_STEP) != 0) \ + ? sizeof(size_t) \ + + (LEXBOR_MEM_ALIGN_STEP - (sizeof(size_t) % LEXBOR_MEM_ALIGN_STEP)) \ + : sizeof(size_t)) + + +typedef struct { + lexbor_mem_t *mem; + lexbor_bst_t *cache; + size_t ref_count; +} +lexbor_mraw_t; + + +LXB_API lexbor_mraw_t * +lexbor_mraw_create(void); + +LXB_API lxb_status_t +lexbor_mraw_init(lexbor_mraw_t *mraw, size_t chunk_size); + +LXB_API void +lexbor_mraw_clean(lexbor_mraw_t *mraw); + +LXB_API lexbor_mraw_t * +lexbor_mraw_destroy(lexbor_mraw_t *mraw, bool destroy_self); + + +LXB_API void * +lexbor_mraw_alloc(lexbor_mraw_t *mraw, size_t size); + +LXB_API void * +lexbor_mraw_calloc(lexbor_mraw_t *mraw, size_t size); + +LXB_API void * +lexbor_mraw_realloc(lexbor_mraw_t *mraw, void *data, size_t new_size); + +LXB_API void * +lexbor_mraw_free(lexbor_mraw_t *mraw, void *data); + + +/* + * Inline functions + */ +lxb_inline size_t +lexbor_mraw_data_size(void *data) +{ + return *((size_t *) (((uint8_t *) data) - lexbor_mraw_meta_size())); +} + +lxb_inline void +lexbor_mraw_data_size_set(void *data, size_t size) +{ + data = (((uint8_t *) data) - 
lexbor_mraw_meta_size()); + memcpy(data, &size, sizeof(size_t)); +} + +lxb_inline void * +lexbor_mraw_dup(lexbor_mraw_t *mraw, const void *src, size_t size) +{ + void *data = lexbor_mraw_alloc(mraw, size); + + if (data != NULL) { + memcpy(data, src, size); + } + + return data; +} + +lxb_inline size_t +lexbor_mraw_reference_count(lexbor_mraw_t *mraw) +{ + return mraw->ref_count; +} + + +/* + * No inline functions for ABI. + */ +LXB_API size_t +lexbor_mraw_data_size_noi(void *data); + +LXB_API void +lexbor_mraw_data_size_set_noi(void *data, size_t size); + +LXB_API void * +lexbor_mraw_dup_noi(lexbor_mraw_t *mraw, const void *src, size_t size); + + +#ifdef __cplusplus +} /* extern "C" */ +#endif + +#endif /* LEXBOR_MRAW_H */ diff --git a/contrib/url/lexbor/core/plog.c b/contrib/url/lexbor/core/plog.c new file mode 100644 index 0000000000..71344b0a2a --- /dev/null +++ b/contrib/url/lexbor/core/plog.c @@ -0,0 +1,73 @@ +/* + * Copyright (C) 2019 Alexander Borisov + * + * Author: Alexander Borisov + */ + +#include "lexbor/core/plog.h" + + +lxb_status_t +lexbor_plog_init(lexbor_plog_t *plog, size_t init_size, size_t struct_size) +{ + lxb_status_t status; + + if (plog == NULL) { + return LXB_STATUS_ERROR_OBJECT_IS_NULL; + } + + if (struct_size < sizeof(lexbor_plog_entry_t)) { + struct_size = sizeof(lexbor_plog_entry_t); + } + + status = lexbor_array_obj_init(&plog->list, init_size, struct_size); + if (status != LXB_STATUS_OK) { + return status; + } + + return LXB_STATUS_OK; +} + +lexbor_plog_t * +lexbor_plog_destroy(lexbor_plog_t *plog, bool self_destroy) +{ + if (plog == NULL) { + return NULL; + } + + lexbor_array_obj_destroy(&plog->list, false); + + if (self_destroy) { + return lexbor_free(plog); + } + + return plog; +} + +/* + * No inline functions. 
+ */ +lexbor_plog_t * +lexbor_plog_create_noi(void) +{ + return lexbor_plog_create(); +} + +void +lexbor_plog_clean_noi(lexbor_plog_t *plog) +{ + lexbor_plog_clean(plog); +} + +void * +lexbor_plog_push_noi(lexbor_plog_t *plog, const lxb_char_t *data, void *ctx, + unsigned id) +{ + return lexbor_plog_push(plog, data, ctx, id); +} + +size_t +lexbor_plog_length_noi(lexbor_plog_t *plog) +{ + return lexbor_plog_length(plog); +} diff --git a/contrib/url/lexbor/core/plog.h b/contrib/url/lexbor/core/plog.h new file mode 100644 index 0000000000..91b35bfcdf --- /dev/null +++ b/contrib/url/lexbor/core/plog.h @@ -0,0 +1,102 @@ +/* + * Copyright (C) 2019 Alexander Borisov + * + * Author: Alexander Borisov + */ + +#ifndef LEXBOR_PLOG_H +#define LEXBOR_PLOG_H + +#ifdef __cplusplus +extern "C" { +#endif + +#include "lexbor/core/array_obj.h" + + +typedef struct { + const lxb_char_t *data; + void *context; + unsigned id; +} +lexbor_plog_entry_t; + +typedef struct { + lexbor_array_obj_t list; +} +lexbor_plog_t; + + +LXB_API lxb_status_t +lexbor_plog_init(lexbor_plog_t *plog, size_t init_size, size_t struct_size); + +LXB_API lexbor_plog_t * +lexbor_plog_destroy(lexbor_plog_t *plog, bool self_destroy); + + +/* + * Inline functions + */ +lxb_inline lexbor_plog_t * +lexbor_plog_create(void) +{ + return (lexbor_plog_t *) lexbor_calloc(1, sizeof(lexbor_plog_t)); +} + +lxb_inline void +lexbor_plog_clean(lexbor_plog_t *plog) +{ + lexbor_array_obj_clean(&plog->list); +} + +lxb_inline void * +lexbor_plog_push(lexbor_plog_t *plog, const lxb_char_t *data, void *ctx, + unsigned id) +{ + lexbor_plog_entry_t *entry; + + if (plog == NULL) { + return NULL; + } + + entry = (lexbor_plog_entry_t *) lexbor_array_obj_push(&plog->list); + if (entry == NULL) { + return NULL; + } + + entry->data = data; + entry->context = ctx; + entry->id = id; + + return (void *) entry; +} + +lxb_inline size_t +lexbor_plog_length(lexbor_plog_t *plog) +{ + return lexbor_array_obj_length(&plog->list); +} + +/* + * No inline 
functions for ABI. + */ +LXB_API lexbor_plog_t * +lexbor_plog_create_noi(void); + +LXB_API void +lexbor_plog_clean_noi(lexbor_plog_t *plog); + +LXB_API void * +lexbor_plog_push_noi(lexbor_plog_t *plog, const lxb_char_t *data, void *ctx, + unsigned id); + +LXB_API size_t +lexbor_plog_length_noi(lexbor_plog_t *plog); + + +#ifdef __cplusplus +} /* extern "C" */ +#endif + +#endif /* LEXBOR_PLOG_H */ + diff --git a/contrib/url/lexbor/core/serialize.h b/contrib/url/lexbor/core/serialize.h new file mode 100644 index 0000000000..1efdd8d664 --- /dev/null +++ b/contrib/url/lexbor/core/serialize.h @@ -0,0 +1,24 @@ +/* + * Copyright (C) 2021 Alexander Borisov + * + * Author: Alexander Borisov + */ + +#ifndef LEXBOR_SERIALIZE_H +#define LEXBOR_SERIALIZE_H + + +#include "lexbor/core/base.h" + + +#define lexbor_serialize_write(cb, data, length, ctx, status) \ + do { \ + (status) = (cb)((lxb_char_t *) (data), (length), (ctx)); \ + if ((status) != LXB_STATUS_OK) { \ + return (status); \ + } \ + } \ + while (false) + + +#endif /* LEXBOR_SERIALIZE_H */ diff --git a/contrib/url/lexbor/core/shs.c b/contrib/url/lexbor/core/shs.c new file mode 100644 index 0000000000..679143bf45 --- /dev/null +++ b/contrib/url/lexbor/core/shs.c @@ -0,0 +1,118 @@ +/* + * Copyright (C) 2018-2019 Alexander Borisov + * + * Author: Alexander Borisov + */ + +#include "lexbor/core/shs.h" +#include "lexbor/core/str.h" + +#define LEXBOR_STR_RES_MAP_LOWERCASE +#define LEXBOR_STR_RES_MAP_UPPERCASE +#include "lexbor/core/str_res.h" + + +#define lexbor_shs_make_id_m(key, size, table_size) \ + (((((key[0] * key[size - 1]) * key[0]) + size) % table_size) + 0x01) + +#define lexbor_shs_make_id_lower_m(key, size, table_size) \ + (((((lexbor_str_res_map_lowercase[key[0]] \ + * lexbor_str_res_map_lowercase[key[size - 1]]) \ + * lexbor_str_res_map_lowercase[key[0]]) \ + + size) \ + % table_size) + 0x01) + +#define lexbor_shs_make_id_upper_m(key, size, table_size) \ + (((((lexbor_str_res_map_uppercase[key[0]] \ + * 
lexbor_str_res_map_uppercase[key[size - 1]]) \ + * lexbor_str_res_map_uppercase[key[0]]) \ + + size) \ + % table_size) + 0x01) + + +const lexbor_shs_entry_t * +lexbor_shs_entry_get_static(const lexbor_shs_entry_t *root, + const lxb_char_t *key, size_t key_len) +{ + const lexbor_shs_entry_t *entry; + entry = root + lexbor_shs_make_id_m(key, key_len, root->key_len); + + while (entry->key != NULL) + { + if (entry->key_len == key_len) { + if (lexbor_str_data_ncmp((const lxb_char_t *) entry->key, + key, key_len)) + { + return entry; + } + + entry = &root[entry->next]; + } + else if (entry->key_len > key_len) { + return NULL; + } + else { + entry = &root[entry->next]; + } + } + + return NULL; +} + +const lexbor_shs_entry_t * +lexbor_shs_entry_get_lower_static(const lexbor_shs_entry_t *root, + const lxb_char_t *key, size_t key_len) +{ + const lexbor_shs_entry_t *entry; + entry = root + lexbor_shs_make_id_lower_m(key, key_len, root->key_len); + + while (entry->key != NULL) + { + if (entry->key_len == key_len) { + if (lexbor_str_data_nlocmp_right((const lxb_char_t *) entry->key, + key, key_len)) + { + return entry; + } + + entry = &root[entry->next]; + } + else if (entry->key_len > key_len) { + return NULL; + } + else { + entry = &root[entry->next]; + } + } + + return NULL; +} + +const lexbor_shs_entry_t * +lexbor_shs_entry_get_upper_static(const lexbor_shs_entry_t *root, + const lxb_char_t *key, size_t key_len) +{ + const lexbor_shs_entry_t *entry; + entry = root + lexbor_shs_make_id_upper_m(key, key_len, root->key_len); + + while (entry->key != NULL) + { + if (entry->key_len == key_len) { + if (lexbor_str_data_nupcmp_right((const lxb_char_t *) entry->key, + key, key_len)) + { + return entry; + } + + entry = &root[entry->next]; + } + else if (entry->key_len > key_len) { + return NULL; + } + else { + entry = &root[entry->next]; + } + } + + return NULL; +} diff --git a/contrib/url/lexbor/core/shs.h b/contrib/url/lexbor/core/shs.h new file mode 100644 index 
0000000000..7a63a07409 --- /dev/null +++ b/contrib/url/lexbor/core/shs.h @@ -0,0 +1,82 @@ +/* + * Copyright (C) 2018 Alexander Borisov + * + * Author: Alexander Borisov + */ + +#ifndef LEXBOR_SHS_H +#define LEXBOR_SHS_H + +#ifdef __cplusplus +extern "C" { +#endif + +#include <string.h> + +#include "lexbor/core/base.h" + + +typedef struct { + char *key; + void *value; + + size_t key_len; + size_t next; +} +lexbor_shs_entry_t; + +typedef struct { + uint32_t key; + void *value; + + size_t next; +} +lexbor_shs_hash_t; + + +LXB_API const lexbor_shs_entry_t * +lexbor_shs_entry_get_static(const lexbor_shs_entry_t *tree, + const lxb_char_t *key, size_t size); + +LXB_API const lexbor_shs_entry_t * +lexbor_shs_entry_get_lower_static(const lexbor_shs_entry_t *root, + const lxb_char_t *key, size_t key_len); + +LXB_API const lexbor_shs_entry_t * +lexbor_shs_entry_get_upper_static(const lexbor_shs_entry_t *root, + const lxb_char_t *key, size_t key_len); + +/* + * Inline functions + */ +lxb_inline const lexbor_shs_hash_t * +lexbor_shs_hash_get_static(const lexbor_shs_hash_t *table, + const size_t table_size, const uint32_t key) +{ + const lexbor_shs_hash_t *entry; + + entry = &table[ (key % table_size) + 1 ]; + + do { + if (entry->key == key) { + return entry; + } + + entry = &table[entry->next]; + } + while (entry != table); + + return NULL; +} + + +#ifdef __cplusplus +} /* extern "C" */ +#endif + +#endif /* LEXBOR_SHS_H */ + + + + + diff --git a/contrib/url/lexbor/core/str.c b/contrib/url/lexbor/core/str.c new file mode 100644 index 0000000000..0f04286bde --- /dev/null +++ b/contrib/url/lexbor/core/str.c @@ -0,0 +1,642 @@ +/* + * Copyright (C) 2018 Alexander Borisov + * + * Author: Alexander Borisov + */ + +#include "lexbor/core/str.h" + +#define LEXBOR_STR_RES_ANSI_REPLACEMENT_CHARACTER +#define LEXBOR_STR_RES_MAP_LOWERCASE +#define LEXBOR_STR_RES_MAP_UPPERCASE +#include "lexbor/core/str_res.h" + + +lexbor_str_t * +lexbor_str_create(void) +{ + return lexbor_calloc(1, 
sizeof(lexbor_str_t)); +} + +lxb_char_t * +lexbor_str_init(lexbor_str_t *str, lexbor_mraw_t *mraw, size_t size) +{ + if (str == NULL) { + return NULL; + } + + str->data = lexbor_mraw_alloc(mraw, (size + 1)); + str->length = 0; + + if (str->data != NULL) { + *str->data = '\0'; + } + + return str->data; +} + +lxb_char_t * +lexbor_str_init_append(lexbor_str_t *str, lexbor_mraw_t *mraw, + const lxb_char_t *data, size_t length) +{ + lxb_char_t *p; + + if (str == NULL) { + return NULL; + } + + p = lexbor_mraw_alloc(mraw, (length + 1)); + if (p == NULL) { + return NULL; + } + + memcpy(p, data, length); + + p[length] = '\0'; + + str->data = p; + str->length = length; + + return p; +} + +void +lexbor_str_clean(lexbor_str_t *str) +{ + str->length = 0; +} + +void +lexbor_str_clean_all(lexbor_str_t *str) +{ + memset(str, 0, sizeof(lexbor_str_t)); +} + +lexbor_str_t * +lexbor_str_destroy(lexbor_str_t *str, lexbor_mraw_t *mraw, bool destroy_obj) +{ + if (str == NULL) { + return NULL; + } + + if (str->data != NULL) { + str->data = lexbor_mraw_free(mraw, str->data); + } + + if (destroy_obj) { + return lexbor_free(str); + } + + return str; +} + +lxb_char_t * +lexbor_str_realloc(lexbor_str_t *str, lexbor_mraw_t *mraw, size_t new_size) +{ + lxb_char_t *tmp = lexbor_mraw_realloc(mraw, str->data, new_size); + if (tmp == NULL) { + return NULL; + } + + str->data = tmp; + + return tmp; +} + +lxb_char_t * +lexbor_str_check_size(lexbor_str_t *str, lexbor_mraw_t *mraw, size_t plus_len) +{ + lxb_char_t *tmp; + + if (str->length > (SIZE_MAX - plus_len)) { + return NULL; + } + + if ((str->length + plus_len) <= lexbor_str_size(str)) { + return str->data; + } + + tmp = lexbor_mraw_realloc(mraw, str->data, (str->length + plus_len)); + if (tmp == NULL) { + return NULL; + } + + str->data = tmp; + + return tmp; +} + +/* Append API */ +lxb_char_t * +lexbor_str_append(lexbor_str_t *str, lexbor_mraw_t *mraw, + const lxb_char_t *buff, size_t length) +{ + lxb_char_t *data_begin; + + 
lexbor_str_check_size_arg_m(str, lexbor_str_size(str), + mraw, (length + 1), NULL); + + data_begin = &str->data[str->length]; + memcpy(data_begin, buff, sizeof(lxb_char_t) * length); + + str->length += length; + str->data[str->length] = '\0'; + + return data_begin; +} + +lxb_char_t * +lexbor_str_append_before(lexbor_str_t *str, lexbor_mraw_t *mraw, + const lxb_char_t *buff, size_t length) +{ + lxb_char_t *data_begin; + + lexbor_str_check_size_arg_m(str, lexbor_str_size(str), + mraw, (length + 1), NULL); + + data_begin = &str->data[str->length]; + + memmove(&str->data[length], str->data, sizeof(lxb_char_t) * str->length); + memcpy(str->data, buff, sizeof(lxb_char_t) * length); + + str->length += length; + str->data[str->length] = '\0'; + + return data_begin; +} + +lxb_char_t * +lexbor_str_append_one(lexbor_str_t *str, lexbor_mraw_t *mraw, + const lxb_char_t data) +{ + lexbor_str_check_size_arg_m(str, lexbor_str_size(str), mraw, 2, NULL); + + str->data[str->length] = data; + + str->length += 1; + str->data[str->length] = '\0'; + + return &str->data[(str->length - 1)]; +} + +lxb_char_t * +lexbor_str_append_lowercase(lexbor_str_t *str, lexbor_mraw_t *mraw, + const lxb_char_t *data, size_t length) +{ + size_t i; + lxb_char_t *data_begin; + + lexbor_str_check_size_arg_m(str, lexbor_str_size(str), + mraw, (length + 1), NULL); + + data_begin = &str->data[str->length]; + + for (i = 0; i < length; i++) { + data_begin[i] = lexbor_str_res_map_lowercase[ data[i] ]; + } + + data_begin[i] = '\0'; + str->length += length; + + return data_begin; +} + +lxb_char_t * +lexbor_str_append_with_rep_null_chars(lexbor_str_t *str, lexbor_mraw_t *mraw, + const lxb_char_t *buff, size_t length) +{ + const lxb_char_t *pos, *res, *end; + size_t current_len = str->length; + + lexbor_str_check_size_arg_m(str, lexbor_str_size(str), + mraw, (length + 1), NULL); + end = buff + length; + + while (buff != end) { + pos = memchr(buff, '\0', sizeof(lxb_char_t) * (end - buff)); + if (pos == NULL) { + break; 
+ } + + res = lexbor_str_append(str, mraw, buff, (pos - buff)); + if (res == NULL) { + return NULL; + } + + res = lexbor_str_append(str, mraw, + lexbor_str_res_ansi_replacement_character, + sizeof(lexbor_str_res_ansi_replacement_character) - 1); + if (res == NULL) { + return NULL; + } + + buff = pos + 1; + } + + if (buff != end) { + res = lexbor_str_append(str, mraw, buff, (end - buff)); + if (res == NULL) { + return NULL; + } + } + + return &str->data[current_len]; +} + +lxb_char_t * +lexbor_str_copy(lexbor_str_t *dest, const lexbor_str_t *target, + lexbor_mraw_t *mraw) +{ + if (target->data == NULL) { + return NULL; + } + + if (dest->data == NULL) { + lexbor_str_init(dest, mraw, target->length); + + if (dest->data == NULL) { + return NULL; + } + } + + return lexbor_str_append(dest, mraw, target->data, target->length); +} + +void +lexbor_str_stay_only_whitespace(lexbor_str_t *target) +{ + size_t i, pos = 0; + lxb_char_t *data = target->data; + + for (i = 0; i < target->length; i++) { + if (lexbor_utils_whitespace(data[i], ==, ||)) { + data[pos] = data[i]; + pos++; + } + } + + target->length = pos; +} + +void +lexbor_str_strip_collapse_whitespace(lexbor_str_t *target) +{ + size_t i, offset, ws_i; + lxb_char_t *data = target->data; + + if (target->length == 0) { + return; + } + + if (lexbor_utils_whitespace(*data, ==, ||)) { + *data = 0x20; + } + + for (i = 0, offset = 0, ws_i = 0; i < target->length; i++) + { + if (lexbor_utils_whitespace(data[i], ==, ||)) { + if (data[ws_i] != 0x20) { + data[offset] = 0x20; + + ws_i = offset; + offset++; + } + } + else { + if (data[ws_i] == 0x20) { + ws_i = offset; + } + + data[offset] = data[i]; + offset++; + } + } + + if (offset != i) { + if (offset != 0) { + if (data[offset - 1] == 0x20) { + offset--; + } + } + + data[offset] = 0x00; + target->length = offset; + } +} + +size_t +lexbor_str_crop_whitespace_from_begin(lexbor_str_t *target) +{ + size_t i; + lxb_char_t *data = target->data; + + for (i = 0; i < target->length; i++) { 
+ if (lexbor_utils_whitespace(data[i], !=, &&)) { + break; + } + } + + if (i != 0 && i != target->length) { + memmove(target->data, &target->data[i], (target->length - i)); + } + + target->length -= i; + return i; +} + +size_t +lexbor_str_whitespace_from_begin(lexbor_str_t *target) +{ + size_t i; + lxb_char_t *data = target->data; + + for (i = 0; i < target->length; i++) { + if (lexbor_utils_whitespace(data[i], !=, &&)) { + break; + } + } + + return i; +} + +size_t +lexbor_str_whitespace_from_end(lexbor_str_t *target) +{ + size_t i = target->length; + lxb_char_t *data = target->data; + + while (i) { + i--; + + if (lexbor_utils_whitespace(data[i], !=, &&)) { + return target->length - (i + 1); + } + } + + return 0; +} + +/* + * Data utils + * TODO: All functions need optimization. + */ +const lxb_char_t * +lexbor_str_data_ncasecmp_first(const lxb_char_t *first, const lxb_char_t *sec, + size_t sec_size) +{ + size_t i; + + for (i = 0; i < sec_size; i++) { + if (first[i] == '\0') { + return &first[i]; + } + + if (lexbor_str_res_map_lowercase[ first[i] ] + != lexbor_str_res_map_lowercase[ sec[i] ]) + { + return NULL; + } + } + + return &first[i]; +} + +bool +lexbor_str_data_ncasecmp_end(const lxb_char_t *first, const lxb_char_t *sec, + size_t size) +{ + while (size != 0) { + size--; + + if (lexbor_str_res_map_lowercase[ first[size] ] + != lexbor_str_res_map_lowercase[ sec[size] ]) + { + return false; + } + } + + return true; +} + +bool +lexbor_str_data_ncasecmp_contain(const lxb_char_t *where, size_t where_size, + const lxb_char_t *what, size_t what_size) +{ + for (size_t i = 0; what_size <= (where_size - i); i++) { + if(lexbor_str_data_ncasecmp(&where[i], what, what_size)) { + return true; + } + } + + return false; +} + +bool +lexbor_str_data_ncasecmp(const lxb_char_t *first, const lxb_char_t *sec, + size_t size) +{ + for (size_t i = 0; i < size; i++) { + if (lexbor_str_res_map_lowercase[ first[i] ] + != lexbor_str_res_map_lowercase[ sec[i] ]) + { + return false; + } + 
} + + return true; +} + +bool +lexbor_str_data_nlocmp_right(const lxb_char_t *first, const lxb_char_t *sec, + size_t size) +{ + for (size_t i = 0; i < size; i++) { + if (first[i] != lexbor_str_res_map_lowercase[ sec[i] ]) { + return false; + } + } + + return true; +} + +bool +lexbor_str_data_nupcmp_right(const lxb_char_t *first, const lxb_char_t *sec, + size_t size) +{ + for (size_t i = 0; i < size; i++) { + if (first[i] != lexbor_str_res_map_uppercase[ sec[i] ]) { + return false; + } + } + + return true; +} + +bool +lexbor_str_data_casecmp(const lxb_char_t *first, const lxb_char_t *sec) +{ + for (;;) { + if (lexbor_str_res_map_lowercase[*first] + != lexbor_str_res_map_lowercase[*sec]) + { + return false; + } + + if (*first == '\0') { + return true; + } + + first++; + sec++; + } +} + +bool +lexbor_str_data_ncmp_end(const lxb_char_t *first, const lxb_char_t *sec, + size_t size) +{ + while (size != 0) { + size--; + + if (first[size] != sec[size]) { + return false; + } + } + + return true; +} + +bool +lexbor_str_data_ncmp_contain(const lxb_char_t *where, size_t where_size, + const lxb_char_t *what, size_t what_size) +{ + for (size_t i = 0; what_size <= (where_size - i); i++) { + if(memcmp(&where[i], what, sizeof(lxb_char_t) * what_size) == 0) { + return true; + } + } + + return false; +} + +bool +lexbor_str_data_ncmp(const lxb_char_t *first, const lxb_char_t *sec, + size_t size) +{ + return memcmp(first, sec, sizeof(lxb_char_t) * size) == 0; +} + +bool +lexbor_str_data_cmp(const lxb_char_t *first, const lxb_char_t *sec) +{ + for (;;) { + if (*first != *sec) { + return false; + } + + if (*first == '\0') { + return true; + } + + first++; + sec++; + } +} + +bool +lexbor_str_data_cmp_ws(const lxb_char_t *first, const lxb_char_t *sec) +{ + for (;;) { + if (*first != *sec) { + return false; + } + + if (lexbor_utils_whitespace(*first, ==, ||) || *first == '\0') { + return true; + } + + first++; + sec++; + } +} + +void +lexbor_str_data_to_lowercase(lxb_char_t *to, const 
lxb_char_t *from, size_t len) +{ + while (len) { + len--; + + to[len] = lexbor_str_res_map_lowercase[ from[len] ]; + } +} + +void +lexbor_str_data_to_uppercase(lxb_char_t *to, const lxb_char_t *from, size_t len) +{ + while (len) { + len--; + + to[len] = lexbor_str_res_map_uppercase[ from[len] ]; + } +} + +const lxb_char_t * +lexbor_str_data_find_lowercase(const lxb_char_t *data, size_t len) +{ + while (len) { + len--; + + if (data[len] == lexbor_str_res_map_lowercase[ data[len] ]) { + return &data[len]; + } + } + + return NULL; +} + +const lxb_char_t * +lexbor_str_data_find_uppercase(const lxb_char_t *data, size_t len) +{ + while (len) { + len--; + + if (data[len] == lexbor_str_res_map_uppercase[ data[len] ]) { + return &data[len]; + } + } + + return NULL; +} + +/* + * No inline functions for ABI. + */ +lxb_char_t * +lexbor_str_data_noi(lexbor_str_t *str) +{ + return lexbor_str_data(str); +} + +size_t +lexbor_str_length_noi(lexbor_str_t *str) +{ + return lexbor_str_length(str); +} + +size_t +lexbor_str_size_noi(lexbor_str_t *str) +{ + return lexbor_str_size(str); +} + +void +lexbor_str_data_set_noi(lexbor_str_t *str, lxb_char_t *data) +{ + lexbor_str_data_set(str, data); +} + +lxb_char_t * +lexbor_str_length_set_noi(lexbor_str_t *str, lexbor_mraw_t *mraw, size_t length) +{ + return lexbor_str_length_set(str, mraw, length); +} diff --git a/contrib/url/lexbor/core/str.h b/contrib/url/lexbor/core/str.h new file mode 100644 index 0000000000..3bee145373 --- /dev/null +++ b/contrib/url/lexbor/core/str.h @@ -0,0 +1,252 @@ +/* + * Copyright (C) 2018-2023 Alexander Borisov + * + * Author: Alexander Borisov + */ + +#ifndef LEXBOR_STR_H +#define LEXBOR_STR_H + +#ifdef __cplusplus +extern "C" { +#endif + +#include "lexbor/core/base.h" +#include "lexbor/core/mraw.h" +#include "lexbor/core/utils.h" + + +#define lexbor_str_get(str, attr) str->attr +#define lexbor_str_set(str, attr) lexbor_str_get(str, attr) +#define lexbor_str_len(str) lexbor_str_get(str, length) +#define 
lexbor_str(p) {.data = (lxb_char_t *) (p), sizeof(p) - 1} + + +/* + * Makes sure 'str' has room for 'plus_len' more bytes. + * + * Returns 'return_fail' from the enclosing function when str->length + plus_len + * would exceed SIZE_MAX or when lexbor_mraw_realloc() returns NULL. The buffer + * is reallocated only if the required size is greater than 'size'. + * Every argument is parenthesized in the expansion, but 'str' and 'plus_len' + * are evaluated more than once, so they must be free of side effects. + */ +#define lexbor_str_check_size_arg_m(str, size, mraw, plus_len, return_fail) \ + do { \ + void *tmp; \ + \ + if ((str)->length > (SIZE_MAX - (plus_len))) \ + return (return_fail); \ + \ + if (((str)->length + (plus_len)) > (size)) { \ + tmp = lexbor_mraw_realloc((mraw), (str)->data, \ + ((str)->length + (plus_len))); \ + \ + if (tmp == NULL) { \ + return (return_fail); \ + } \ + \ + (str)->data = (lxb_char_t *) tmp; \ + } \ + } \ + while (0) + + +typedef struct { + lxb_char_t *data; + size_t length; +} +lexbor_str_t; + + +LXB_API lexbor_str_t * +lexbor_str_create(void); + +LXB_API lxb_char_t * +lexbor_str_init(lexbor_str_t *str, lexbor_mraw_t *mraw, size_t size); + +LXB_API lxb_char_t * +lexbor_str_init_append(lexbor_str_t *str, lexbor_mraw_t *mraw, + const lxb_char_t *data, size_t length); + +LXB_API void +lexbor_str_clean(lexbor_str_t *str); + +LXB_API void +lexbor_str_clean_all(lexbor_str_t *str); + +LXB_API lexbor_str_t * +lexbor_str_destroy(lexbor_str_t *str, lexbor_mraw_t *mraw, bool destroy_obj); + + +LXB_API lxb_char_t * +lexbor_str_realloc(lexbor_str_t *str, lexbor_mraw_t *mraw, size_t new_size); + +LXB_API lxb_char_t * +lexbor_str_check_size(lexbor_str_t *str, lexbor_mraw_t *mraw, size_t plus_len); + +/* Append */ +LXB_API lxb_char_t * +lexbor_str_append(lexbor_str_t *str, lexbor_mraw_t *mraw, + const lxb_char_t *data, size_t length); + +LXB_API lxb_char_t * +lexbor_str_append_before(lexbor_str_t *str, lexbor_mraw_t *mraw, + const lxb_char_t *buff, size_t length); + +LXB_API lxb_char_t * +lexbor_str_append_one(lexbor_str_t *str, lexbor_mraw_t *mraw, + const lxb_char_t data); + +LXB_API lxb_char_t * +lexbor_str_append_lowercase(lexbor_str_t *str, lexbor_mraw_t *mraw, + const lxb_char_t *data, size_t length); + +LXB_API lxb_char_t * +lexbor_str_append_with_rep_null_chars(lexbor_str_t *str, lexbor_mraw_t *mraw, + const lxb_char_t *buff, size_t length); + +/* Other functions */ +LXB_API lxb_char_t *
+lexbor_str_copy(lexbor_str_t *dest, const lexbor_str_t *target, + lexbor_mraw_t *mraw); + +LXB_API void +lexbor_str_stay_only_whitespace(lexbor_str_t *target); + +LXB_API void +lexbor_str_strip_collapse_whitespace(lexbor_str_t *target); + +LXB_API size_t +lexbor_str_crop_whitespace_from_begin(lexbor_str_t *target); + +LXB_API size_t +lexbor_str_whitespace_from_begin(lexbor_str_t *target); + +LXB_API size_t +lexbor_str_whitespace_from_end(lexbor_str_t *target); + + +/* Data utils */ +/* + * [in] first: must be null-terminated + * [in] sec: arbitrary data + * [in] sec_size: size of the 'sec' buffer + * + * Compares two lxb_char_t buffers until '\0' is found in the first argument. + * On success the returned pointer points at the '\0' of 'first'; if the + * second buffer runs out before that, the function returns the position + * reached in the first buffer. + * If the function returns NULL, the data are not equal. + */ +LXB_API const lxb_char_t * +lexbor_str_data_ncasecmp_first(const lxb_char_t *first, const lxb_char_t *sec, + size_t sec_size); +LXB_API bool +lexbor_str_data_ncasecmp_end(const lxb_char_t *first, const lxb_char_t *sec, + size_t size); +LXB_API bool +lexbor_str_data_ncasecmp_contain(const lxb_char_t *where, size_t where_size, + const lxb_char_t *what, size_t what_size); +LXB_API bool +lexbor_str_data_ncasecmp(const lxb_char_t *first, const lxb_char_t *sec, + size_t size); +LXB_API bool +lexbor_str_data_nlocmp_right(const lxb_char_t *first, const lxb_char_t *sec, + size_t size); +LXB_API bool +lexbor_str_data_nupcmp_right(const lxb_char_t *first, const lxb_char_t *sec, + size_t size); +LXB_API bool +lexbor_str_data_casecmp(const lxb_char_t *first, const lxb_char_t *sec); + +LXB_API bool +lexbor_str_data_ncmp_end(const lxb_char_t *first, const lxb_char_t *sec, + size_t size); +LXB_API bool +lexbor_str_data_ncmp_contain(const lxb_char_t *where, size_t where_size, + const lxb_char_t *what, size_t what_size); +LXB_API bool +lexbor_str_data_ncmp(const
lxb_char_t *first, const lxb_char_t *sec, + size_t size); + +LXB_API bool +lexbor_str_data_cmp(const lxb_char_t *first, const lxb_char_t *sec); + +LXB_API bool +lexbor_str_data_cmp_ws(const lxb_char_t *first, const lxb_char_t *sec); + +LXB_API void +lexbor_str_data_to_lowercase(lxb_char_t *to, const lxb_char_t *from, size_t len); + +LXB_API void +lexbor_str_data_to_uppercase(lxb_char_t *to, const lxb_char_t *from, size_t len); + +LXB_API const lxb_char_t * +lexbor_str_data_find_lowercase(const lxb_char_t *data, size_t len); + +LXB_API const lxb_char_t * +lexbor_str_data_find_uppercase(const lxb_char_t *data, size_t len); + + +/* + * Inline functions + */ +lxb_inline lxb_char_t * +lexbor_str_data(lexbor_str_t *str) +{ + return str->data; +} + +lxb_inline size_t +lexbor_str_length(lexbor_str_t *str) +{ + return str->length; +} +/* + * Returns whatever lexbor_mraw_data_size() reports for the data buffer; + * presumably the allocated capacity -- TODO confirm against mraw.h. + */ +lxb_inline size_t +lexbor_str_size(lexbor_str_t *str) +{ + return lexbor_mraw_data_size(str->data); +} + +lxb_inline void +lexbor_str_data_set(lexbor_str_t *str, lxb_char_t *data) +{ + str->data = data; +} +/* + * Sets str->length to 'length' and stores a terminating 0x00 at data[length]. + * When 'length' is not smaller than lexbor_str_size(str) the string is first + * passed to lexbor_str_realloc() with length + 1; NULL is returned if that + * call fails, otherwise str->data is returned. + * NOTE(review): relies on lexbor_str_realloc() updating str->data -- confirm. + */ +lxb_inline lxb_char_t * +lexbor_str_length_set(lexbor_str_t *str, lexbor_mraw_t *mraw, size_t length) +{ + if (length >= lexbor_str_size(str)) { + lxb_char_t *tmp; + + tmp = lexbor_str_realloc(str, mraw, length + 1); + if (tmp == NULL) { + return NULL; + } + } + + str->length = length; + str->data[length] = 0x00; + + return str->data; +} + +/* + * No inline functions for ABI.
+ */ +LXB_API lxb_char_t * +lexbor_str_data_noi(lexbor_str_t *str); + +LXB_API size_t +lexbor_str_length_noi(lexbor_str_t *str); + +LXB_API size_t +lexbor_str_size_noi(lexbor_str_t *str); + +LXB_API void +lexbor_str_data_set_noi(lexbor_str_t *str, lxb_char_t *data); + +LXB_API lxb_char_t * +lexbor_str_length_set_noi(lexbor_str_t *str, lexbor_mraw_t *mraw, + size_t length); + + +#ifdef __cplusplus +} /* extern "C" */ +#endif + +#endif /* LEXBOR_STR_H */ diff --git a/contrib/url/lexbor/core/str_res.h b/contrib/url/lexbor/core/str_res.h new file mode 100644 index 0000000000..cebbda8aa5 --- /dev/null +++ b/contrib/url/lexbor/core/str_res.h @@ -0,0 +1,420 @@ +/* + * Copyright (C) 2018 Alexander Borisov + * + * Author: Alexander Borisov + */ + +#ifndef LEXBOR_STR_RES_H +#define LEXBOR_STR_RES_H + +#define LEXBOR_STR_RES_MAP_CHAR_OTHER '\00' +#define LEXBOR_STR_RES_MAP_CHAR_A_Z_a_z '\01' +#define LEXBOR_STR_RES_MAP_CHAR_WHITESPACE '\02' + +#define LEXBOR_STR_RES_SLIP 0xFF + +#endif /* LEXBOR_STR_RES_H */ + +#ifdef LEXBOR_STR_RES_ANSI_REPLACEMENT_CHARACTER +#ifndef LEXBOR_STR_RES_ANSI_REPLACEMENT_CHARACTER_ENABLED +#define LEXBOR_STR_RES_ANSI_REPLACEMENT_CHARACTER_ENABLED +static const lxb_char_t +lexbor_str_res_ansi_replacement_character[] = "\xEF\xBF\xBD"; +#endif /* LEXBOR_STR_RES_ANSI_REPLACEMENT_CHARACTER_ENABLED */ +#endif /* LEXBOR_STR_RES_ANSI_REPLACEMENT_CHARACTER */ + +#ifdef LEXBOR_STR_RES_MAP_NUM +#ifndef LEXBOR_STR_RES_MAP_NUM_ENABLED +#define LEXBOR_STR_RES_MAP_NUM_ENABLED +static const lxb_char_t lexbor_str_res_map_num[] = { + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, + 0x06, 0x07, 0x08, 0x09, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 
0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff +}; +#endif /* LEXBOR_STR_RES_MAP_NUM_ENABLED */ +#endif /* LEXBOR_STR_RES_MAP_NUM */ + + +#ifdef LEXBOR_STR_RES_MAP_HEX +#ifndef LEXBOR_STR_RES_MAP_HEX_ENABLED +#define LEXBOR_STR_RES_MAP_HEX_ENABLED +static const lxb_char_t lexbor_str_res_map_hex[] = { + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, + 0x06, 0x07, 0x08, 0x09, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 
0xff, 0xff, 0x0a, 0x0b, + 0x0c, 0x0d, 0x0e, 0x0f, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff +}; +#endif /* LEXBOR_STR_RES_MAP_HEX_ENABLED */ +#endif /* LEXBOR_STR_RES_MAP_HEX */ + + +#ifdef LEXBOR_STR_RES_MAP_LOWERCASE +#ifndef LEXBOR_STR_RES_MAP_LOWERCASE_ENABLED +#define LEXBOR_STR_RES_MAP_LOWERCASE_ENABLED +static const lxb_char_t lexbor_str_res_map_lowercase[] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, + 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, + 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, + 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20, 0x21, 0x22, 0x23, + 0x24, 0x25, 0x26, 0x27, 0x28, 0x29, 0x2a, 0x2b, 0x2c, + 0x2d, 0x2e, 0x2f, 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, + 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, + 0x3f, 0x40, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, 0x70, + 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, + 0x7a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, 0x60, 0x61, 0x62, + 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x6b, + 0x6c, 0x6d, 0x6e, 0x6f, 0x70, 0x71, 0x72, 
0x73, 0x74, + 0x75, 0x76, 0x77, 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, + 0x7e, 0x7f, 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, + 0x87, 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, + 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, 0xa0, 0xa1, + 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, 0xa8, 0xa9, 0xaa, + 0xab, 0xac, 0xad, 0xae, 0xaf, 0xb0, 0xb1, 0xb2, 0xb3, + 0xb4, 0xb5, 0xb6, 0xb7, 0xb8, 0xb9, 0xba, 0xbb, 0xbc, + 0xbd, 0xbe, 0xbf, 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, + 0xc6, 0xc7, 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, + 0xcf, 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, 0xe0, + 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9, + 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, 0xf0, 0xf1, 0xf2, + 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8, 0xf9, 0xfa, 0xfb, + 0xfc, 0xfd, 0xfe, 0xff +}; +#endif /* LEXBOR_STR_RES_MAP_LOWERCASE_ENABLED */ +#endif /* LEXBOR_STR_RES_MAP_LOWERCASE */ + + +#ifdef LEXBOR_STR_RES_MAP_UPPERCASE +#ifndef LEXBOR_STR_RES_MAP_UPPERCASE_ENABLED +#define LEXBOR_STR_RES_MAP_UPPERCASE_ENABLED +static const lxb_char_t lexbor_str_res_map_uppercase[] = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, + 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, + 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, + 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20, 0x21, 0x22, 0x23, + 0x24, 0x25, 0x26, 0x27, 0x28, 0x29, 0x2a, 0x2b, 0x2c, + 0x2d, 0x2e, 0x2f, 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, + 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, + 0x3f, 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, 0x50, + 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, + 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, 0x60, 0x41, 0x42, + 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x4a, 0x4b, + 0x4c, 0x4d, 0x4e, 0x4f, 0x50, 0x51, 0x52, 0x53, 0x54, + 0x55, 0x56, 0x57, 0x58, 0x59, 0x5a, 0x7b, 0x7c, 0x7d, + 0x7e, 0x7f, 0x80, 0x81, 0x82, 0x83, 0x84, 
0x85, 0x86, + 0x87, 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, + 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, + 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, 0xa0, 0xa1, + 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, 0xa8, 0xa9, 0xaa, + 0xab, 0xac, 0xad, 0xae, 0xaf, 0xb0, 0xb1, 0xb2, 0xb3, + 0xb4, 0xb5, 0xb6, 0xb7, 0xb8, 0xb9, 0xba, 0xbb, 0xbc, + 0xbd, 0xbe, 0xbf, 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, + 0xc6, 0xc7, 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, + 0xcf, 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, + 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, 0xe0, + 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9, + 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, 0xf0, 0xf1, 0xf2, + 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8, 0xf9, 0xfa, 0xfb, + 0xfc, 0xfd, 0xfe, 0xff +}; +#endif /* LEXBOR_STR_RES_MAP_UPPERCASE_ENABLED */ +#endif /* LEXBOR_STR_RES_MAP_UPPERCASE */ + + +#ifdef LEXBOR_STR_RES_REPLACEMENT_CHARACTER +#ifndef LEXBOR_STR_RES_REPLACEMENT_CHARACTER_ENABLED +#define LEXBOR_STR_RES_REPLACEMENT_CHARACTER_ENABLED +static const size_t lexbor_str_res_replacement_character[] = { + 65533, 1, 2, 3, 4, 5, 6, 7, 8, + 9, 10, 11, 12, 13, 14, 15, 16, 17, + 18, 19, 20, 21, 22, 23, 24, 25, 26, + 27, 28, 29, 30, 31, 32, 33, 34, 35, + 36, 37, 38, 39, 40, 41, 42, 43, 44, + 45, 46, 47, 48, 49, 50, 51, 52, 53, + 54, 55, 56, 57, 58, 59, 60, 61, 62, + 63, 64, 65, 66, 67, 68, 69, 70, 71, + 72, 73, 74, 75, 76, 77, 78, 79, 80, + 81, 82, 83, 84, 85, 86, 87, 88, 89, + 90, 91, 92, 93, 94, 95, 96, 97, 98, + 99, 100, 101, 102, 103, 104, 105, 106, 107, + 108, 109, 110, 111, 112, 113, 114, 115, 116, + 117, 118, 119, 120, 121, 122, 123, 124, 125, + 126, 127, 8364, 129, 8218, 402, 8222, 8230, 8224, + 8225, 710, 8240, 352, 8249, 338, 141, 381, 143, + 144, 8216, 8217, 8220, 8221, 8226, 8211, 8212, 732, + 8482, 353, 8250, 339, 157, 382, 376 +}; +#endif /* LEXBOR_STR_RES_REPLACEMENT_CHARACTER_ENABLED */ +#endif /* LEXBOR_STR_RES_REPLACEMENT_CHARACTER */ + + +#ifdef LEXBOR_STR_RES_ALPHANUMERIC_CHARACTER 
+#ifndef LEXBOR_STR_RES_ALPHANUMERIC_CHARACTER_ENABLED +#define LEXBOR_STR_RES_ALPHANUMERIC_CHARACTER_ENABLED +static const size_t lexbor_str_res_alphanumeric_character[] = { + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, + 0x06, 0x07, 0x08, 0x09, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x0a, + 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x0a, 0x0b, 0x0c, 0x0d, + 0x0e, 0x0f, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x0a, + 0x0b, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x0c, 0x0d, + 0x0e, 0x0f, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x0a, + 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x0a, 0x0b, 0x0c, 0x0d, + 0x0e, 0x0f, 0x0a, 0x0b, 0x0c, 0x0d, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff +}; +#endif /* LEXBOR_STR_RES_ALPHANUMERIC_CHARACTER_ENABLED */ +#endif /* LEXBOR_STR_RES_ALPHANUMERIC_CHARACTER */ + + +#ifdef LEXBOR_STR_RES_ALPHA_CHARACTER +#ifndef LEXBOR_STR_RES_ALPHA_CHARACTER_ENABLED +#define 
LEXBOR_STR_RES_ALPHA_CHARACTER_ENABLED +static const size_t lexbor_str_res_alpha_character[] = { + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x0a, + 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x0a, 0x0b, 0x0c, 0x0d, + 0x0e, 0x0f, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x0a, + 0x0b, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x0c, 0x0d, + 0x0e, 0x0f, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x0a, + 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x0a, 0x0b, 0x0c, 0x0d, + 0x0e, 0x0f, 0x0a, 0x0b, 0x0c, 0x0d, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff +}; +#endif /* LEXBOR_STR_RES_ALPHA_CHARACTER_ENABLED */ +#endif /* LEXBOR_STR_RES_ALPHA_CHARACTER */ + + +#ifdef LEXBOR_TOKENIZER_CHARS_MAP +#ifndef LEXBOR_TOKENIZER_CHARS_MAP_ENABLED +#define LEXBOR_TOKENIZER_CHARS_MAP_ENABLED +static const unsigned char lexbor_tokenizer_chars_map[] = { + 0xff, 0xff, 0xff, 
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0x02, 0x02, 0xff, 0x02, 0x02, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0x02, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, + 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, + 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, + 0x01, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x01, 0x01, + 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, + 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, + 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff +}; +#endif /* LEXBOR_TOKENIZER_CHARS_MAP_ENABLED */ +#endif /* LEXBOR_TOKENIZER_CHARS_MAP */ + + +#ifdef LEXBOR_STR_RES_MAP_HEX_TO_CHAR +#ifndef LEXBOR_STR_RES_MAP_HEX_TO_CHAR_ENABLED +#define LEXBOR_STR_RES_MAP_HEX_TO_CHAR_ENABLED +static const lxb_char_t lexbor_str_res_map_hex_to_char[] = { + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, + 0x39, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x00 +}; +#endif /* 
LEXBOR_STR_RES_MAP_HEX_TO_CHAR_ENABLED */ +#endif /* LEXBOR_STR_RES_MAP_HEX_TO_CHAR */ + +#ifdef LEXBOR_STR_RES_MAP_HEX_TO_CHAR_LOWERCASE +#ifndef LEXBOR_STR_RES_MAP_HEX_TO_CHAR_LOWERCASE_ENABLED +#define LEXBOR_STR_RES_MAP_HEX_TO_CHAR_LOWERCASE_ENABLED +static const lxb_char_t lexbor_str_res_map_hex_to_char_lowercase[] = { + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, + 0x39, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x00 +}; +#endif /* LEXBOR_STR_RES_MAP_HEX_TO_CHAR_LOWERCASE_ENABLED */ +#endif /* LEXBOR_STR_RES_MAP_HEX_TO_CHAR_LOWERCASE */ + + +#ifdef LEXBOR_STR_RES_CHAR_TO_TWO_HEX_VALUE +#ifndef LEXBOR_STR_RES_CHAR_TO_TWO_HEX_VALUE_ENABLED +#define LEXBOR_STR_RES_CHAR_TO_TWO_HEX_VALUE_ENABLED +static const char * lexbor_str_res_char_to_two_hex_value[257] = { + "00", "01", "02", "03", "04", "05", "06", "07", + "08", "09", "0A", "0B", "0C", "0D", "0E", "0F", + "10", "11", "12", "13", "14", "15", "16", "17", + "18", "19", "1A", "1B", "1C", "1D", "1E", "1F", + "20", "21", "22", "23", "24", "25", "26", "27", + "28", "29", "2A", "2B", "2C", "2D", "2E", "2F", + "30", "31", "32", "33", "34", "35", "36", "37", + "38", "39", "3A", "3B", "3C", "3D", "3E", "3F", + "40", "41", "42", "43", "44", "45", "46", "47", + "48", "49", "4A", "4B", "4C", "4D", "4E", "4F", + "50", "51", "52", "53", "54", "55", "56", "57", + "58", "59", "5A", "5B", "5C", "5D", "5E", "5F", + "60", "61", "62", "63", "64", "65", "66", "67", + "68", "69", "6A", "6B", "6C", "6D", "6E", "6F", + "70", "71", "72", "73", "74", "75", "76", "77", + "78", "79", "7A", "7B", "7C", "7D", "7E", "7F", + "80", "81", "82", "83", "84", "85", "86", "87", + "88", "89", "8A", "8B", "8C", "8D", "8E", "8F", + "90", "91", "92", "93", "94", "95", "96", "97", + "98", "99", "9A", "9B", "9C", "9D", "9E", "9F", + "A0", "A1", "A2", "A3", "A4", "A5", "A6", "A7", + "A8", "A9", "AA", "AB", "AC", "AD", "AE", "AF", + "B0", "B1", "B2", "B3", "B4", "B5", "B6", "B7", + "B8", "B9", "BA", "BB", "BC", "BD", "BE", "BF", + "C0", "C1", "C2", 
"C3", "C4", "C5", "C6", "C7", + "C8", "C9", "CA", "CB", "CC", "CD", "CE", "CF", + "D0", "D1", "D2", "D3", "D4", "D5", "D6", "D7", + "D8", "D9", "DA", "DB", "DC", "DD", "DE", "DF", + "E0", "E1", "E2", "E3", "E4", "E5", "E6", "E7", + "E8", "E9", "EA", "EB", "EC", "ED", "EE", "EF", + "F0", "F1", "F2", "F3", "F4", "F5", "F6", "F7", + "F8", "F9", "FA", "FB", "FC", "FD", "FE", "FF", + NULL +}; +#endif /* LEXBOR_STR_RES_CHAR_TO_TWO_HEX_VALUE_ENABLED */ +#endif /* LEXBOR_STR_RES_CHAR_TO_TWO_HEX_VALUE */ + +#ifdef LEXBOR_STR_RES_CHAR_TO_TWO_HEX_VALUE_LOWERCASE +#ifndef LEXBOR_STR_RES_CHAR_TO_TWO_HEX_VALUE_LOWERCASE_ENABLED +#define LEXBOR_STR_RES_CHAR_TO_TWO_HEX_VALUE_LOWERCASE_ENABLED +static const char * lexbor_str_res_char_to_two_hex_value_lowercase[257] = { + "00", "01", "02", "03", "04", "05", "06", "07", + "08", "09", "0a", "0b", "0c", "0d", "0e", "0f", + "10", "11", "12", "13", "14", "15", "16", "17", + "18", "19", "1a", "1b", "1c", "1d", "1e", "1f", + "20", "21", "22", "23", "24", "25", "26", "27", + "28", "29", "2a", "2b", "2c", "2d", "2e", "2f", + "30", "31", "32", "33", "34", "35", "36", "37", + "38", "39", "3a", "3b", "3c", "3d", "3e", "3f", + "40", "41", "42", "43", "44", "45", "46", "47", + "48", "49", "4a", "4b", "4c", "4d", "4e", "4f", + "50", "51", "52", "53", "54", "55", "56", "57", + "58", "59", "5a", "5b", "5c", "5d", "5e", "5f", + "60", "61", "62", "63", "64", "65", "66", "67", + "68", "69", "6a", "6b", "6c", "6d", "6e", "6f", + "70", "71", "72", "73", "74", "75", "76", "77", + "78", "79", "7a", "7b", "7c", "7d", "7e", "7f", + "80", "81", "82", "83", "84", "85", "86", "87", + "88", "89", "8a", "8b", "8c", "8d", "8e", "8f", + "90", "91", "92", "93", "94", "95", "96", "97", + "98", "99", "9a", "9b", "9c", "9d", "9e", "9f", + "a0", "a1", "a2", "a3", "a4", "a5", "a6", "a7", + "a8", "a9", "aa", "ab", "ac", "ad", "ae", "af", + "b0", "b1", "b2", "b3", "b4", "b5", "b6", "b7", + "b8", "b9", "ba", "bb", "bc", "bd", "be", "bf", + "c0", "c1", "c2", "c3", "c4", 
"c5", "c6", "c7", + "c8", "c9", "ca", "cb", "cc", "cd", "ce", "cf", + "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", + "d8", "d9", "da", "db", "dc", "dd", "de", "df", + "e0", "e1", "e2", "e3", "e4", "e5", "e6", "e7", + "e8", "e9", "ea", "eb", "ec", "ed", "ee", "ef", + "f0", "f1", "f2", "f3", "f4", "f5", "f6", "f7", + "f8", "f9", "fa", "fb", "fc", "fd", "fe", "ff", + NULL +}; +#endif /* LEXBOR_STR_RES_CHAR_TO_TWO_HEX_VALUE_LOWERCASE_ENABLED */ +#endif /* LEXBOR_STR_RES_CHAR_TO_TWO_HEX_VALUE_LOWERCASE */ diff --git a/contrib/url/lexbor/core/strtod.c b/contrib/url/lexbor/core/strtod.c new file mode 100644 index 0000000000..6389fc3156 --- /dev/null +++ b/contrib/url/lexbor/core/strtod.c @@ -0,0 +1,326 @@ +/* + * Copyright (C) Alexander Borisov + * + * Based on nxt_strtod.c from NGINX NJS project + * + * An internal strtod() implementation based upon V8 src/strtod.cc + * without bignum support. + * + * Copyright 2012 the V8 project authors. All rights reserved. + * Use of this source code is governed by a BSD-style license + * that can be found in the LICENSE file. + */ + +#include <stdint.h> +#include <math.h> + +#include "lexbor/core/diyfp.h" +#include "lexbor/core/strtod.h" + + +/* + * Max double: 1.7976931348623157 x 10^308 + * Min non-zero double: 4.9406564584124654 x 10^-324 + * Any x >= 10^309 is interpreted as +infinity. + * Any x <= 10^-324 is interpreted as 0. + * Note that 2.5e-324 (despite being smaller than the min double) + * will be read as non-zero (equal to the min non-zero double).
+ */ + +#define LEXBOR_DECIMAL_POWER_MAX 309 +#define LEXBOR_DECIMAL_POWER_MIN (-324) + +#define LEXBOR_UINT64_MAX lexbor_uint64_hl(0xFFFFFFFF, 0xFFFFFFFF) +#define LEXBOR_UINT64_DECIMAL_DIGITS_MAX 19 + +#define LEXBOR_DENOM_LOG 3 +#define LEXBOR_DENOM (1 << LEXBOR_DENOM_LOG) + + +static lexbor_diyfp_t +lexbor_strtod_diyfp_read(const lxb_char_t *start, size_t length, + int *remaining); + +static double +lexbor_strtod_diyfp_strtod(const lxb_char_t *start, size_t length, int exp); + + +/* + * Reads digits from the buffer and converts them to a uint64. + * Reads in as many digits as fit into a uint64. + * When the string starts with "1844674407370955161" no further digit is read. + * Since 2^64 = 18446744073709551616 it would still be possible to read another + * digit if it was less than or equal to 5, but this would complicate the code. + * The number of bytes consumed is stored in *ndigits. + */ +lxb_inline uint64_t +lexbor_strtod_read_uint64(const lxb_char_t *start, size_t length, + size_t *ndigits) +{ + lxb_char_t d; + uint64_t value; + const lxb_char_t *p, *e; + + value = 0; + + p = start; + e = p + length; + + while (p < e && value <= (UINT64_MAX / 10 - 1)) { + d = *p++ - '0'; + value = 10 * value + d; + } + + *ndigits = p - start; + + return value; +} + +/* + * Reads a lexbor_diyfp_t from the buffer. + * The returned lexbor_diyfp_t is not necessarily normalized. + * If remaining is zero then the returned lexbor_diyfp_t is accurate. + * Otherwise it has been rounded and has an error of at most 1/2 ulp. + */ +static lexbor_diyfp_t +lexbor_strtod_diyfp_read(const lxb_char_t *start, size_t length, int *remaining) +{ + size_t read; + uint64_t significand; + + significand = lexbor_strtod_read_uint64(start, length, &read); + + /* Round the significand. */ + + if (length != read) { + if (start[read] >= '5') { + significand++; + } + } + + *remaining = (int) (length - read); + + return lexbor_diyfp(significand, 0); +} + +/* + * Returns 10^exp as an exact lexbor_diyfp_t. + * The given exp must be in the range [1; 7]; for any other value the switch + * below returns lexbor_diyfp(0, 0).
+ */ +lxb_inline lexbor_diyfp_t +lexbor_strtod_adjust_pow10(int exp) +{ + switch (exp) { + case 1: + return lexbor_diyfp(lexbor_uint64_hl(0xa0000000, 00000000), -60); + case 2: + return lexbor_diyfp(lexbor_uint64_hl(0xc8000000, 00000000), -57); + case 3: + return lexbor_diyfp(lexbor_uint64_hl(0xfa000000, 00000000), -54); + case 4: + return lexbor_diyfp(lexbor_uint64_hl(0x9c400000, 00000000), -50); + case 5: + return lexbor_diyfp(lexbor_uint64_hl(0xc3500000, 00000000), -47); + case 6: + return lexbor_diyfp(lexbor_uint64_hl(0xf4240000, 00000000), -44); + case 7: + return lexbor_diyfp(lexbor_uint64_hl(0x98968000, 00000000), -40); + default: + return lexbor_diyfp(0, 0); + } +} + +/* + * Returns the significand size for a given order of magnitude. + * If v = f*2^e with 2^p-1 <= f <= 2^p then p+e is v's order of magnitude. + * This function returns the number of significant binary digits v will have + * once it is encoded into a double. In almost all cases this is equal to + * LEXBOR_SIGNIFICAND_SIZE. The only exceptions are denormals. They start with + * leading zeroes and their effective significand-size is hence smaller. + */ +lxb_inline int +lexbor_strtod_diyfp_sgnd_size(int order) +{ + if (order >= (LEXBOR_DBL_EXPONENT_DENORMAL + LEXBOR_SIGNIFICAND_SIZE)) { + return LEXBOR_SIGNIFICAND_SIZE; + } + + if (order <= LEXBOR_DBL_EXPONENT_DENORMAL) { + return 0; + } + + return order - LEXBOR_DBL_EXPONENT_DENORMAL; +} + +/* + * Returns either the correct double or the double that is just below + * the correct double. + */ +static double +lexbor_strtod_diyfp_strtod(const lxb_char_t *start, size_t length, int exp) +{ + int magnitude, prec_digits; + int remaining, dec_exp, adj_exp, orig_e, shift; + int64_t error; + uint64_t prec_bits, half_way; + lexbor_diyfp_t value, pow, adj_pow, rounded; + + value = lexbor_strtod_diyfp_read(start, length, &remaining); + + exp += remaining; + + /* + * Since some digits may have been dropped the value is not accurate.
+ * If remaining is different from 0 then the error is at most .5 ulp + * (unit in the last place). + * Using a common denominator to avoid dealing with fractions. + */ + + error = (remaining == 0 ? 0 : LEXBOR_DENOM / 2); + + orig_e = value.exp; + value = lexbor_diyfp_normalize(value); + error <<= orig_e - value.exp; + + if (exp < LEXBOR_DECIMAL_EXPONENT_MIN) { + return 0.0; + } + + pow = lexbor_cached_power_dec(exp, &dec_exp); + + if (dec_exp != exp) { + adj_exp = exp - dec_exp; + adj_pow = lexbor_strtod_adjust_pow10(exp - dec_exp); + value = lexbor_diyfp_mul(value, adj_pow); + + if (LEXBOR_UINT64_DECIMAL_DIGITS_MAX - (int) length < adj_exp) { + /* + * The adjustment power is exact. There is hence only + * an error of 0.5. + */ + error += LEXBOR_DENOM / 2; + } + } + + value = lexbor_diyfp_mul(value, pow); + + /* + * The error introduced by a multiplication of a * b equals + * error_a + error_b + error_a * error_b / 2^64 + 0.5 + * Substituting a with 'value' and b with 'pow': + * error_b = 0.5 (all cached powers have an error of less than 0.5 ulp), + * error_ab = 0 or 1 / LEXBOR_DENOM > error_a * error_b / 2^64. + */ + + error += LEXBOR_DENOM + (error != 0 ? 1 : 0); + + orig_e = value.exp; + value = lexbor_diyfp_normalize(value); + error <<= orig_e - value.exp; + + /* + * Check whether the double's significand changes when the error is added + * or subtracted. + */ + + magnitude = LEXBOR_DIYFP_SIGNIFICAND_SIZE + value.exp; + prec_digits = LEXBOR_DIYFP_SIGNIFICAND_SIZE + - lexbor_strtod_diyfp_sgnd_size(magnitude); + + if (prec_digits + LEXBOR_DENOM_LOG >= LEXBOR_DIYFP_SIGNIFICAND_SIZE) { + /* + * This can only happen for very small denormals. In this case the + * half-way multiplied by the denominator exceeds the range of uint64. + * Simply shift everything to the right.
+ */ + shift = prec_digits + LEXBOR_DENOM_LOG + - LEXBOR_DIYFP_SIGNIFICAND_SIZE + 1; + + value = lexbor_diyfp_shift_right(value, shift); + + /* + * Add 1 for the lost precision of error, and NXT_DENOM + * for the lost precision of value.significand. + */ + error = (error >> shift) + 1 + LEXBOR_DENOM; + prec_digits -= shift; + } + + prec_bits = value.significand & (((uint64_t) 1 << prec_digits) - 1); + prec_bits *= LEXBOR_DENOM; + + half_way = (uint64_t) 1 << (prec_digits - 1); + half_way *= LEXBOR_DENOM; + + rounded = lexbor_diyfp_shift_right(value, prec_digits); + + if (prec_bits >= half_way + error) { + rounded.significand++; + } + + return lexbor_diyfp_2d(rounded); +} + +double +lexbor_strtod_internal(const lxb_char_t *start, size_t length, int exp) +{ + size_t left, right; + const lxb_char_t *p, *e, *b; + + /* Trim leading zeroes. */ + + p = start; + e = p + length; + + while (p < e) { + if (*p != '0') { + start = p; + break; + } + + p++; + } + + left = e - p; + + /* Trim trailing zeroes. */ + + b = start; + p = b + left - 1; + + while (p > b) { + if (*p != '0') { + break; + } + + p--; + } + + right = p - b + 1; + + length = right; + + if (length == 0) { + return 0.0; + } + + exp += (int) (left - right); + + if (exp + (int) length - 1 >= LEXBOR_DECIMAL_POWER_MAX) { + return INFINITY; + } + + if (exp + (int) length <= LEXBOR_DECIMAL_POWER_MIN) { + return 0.0; + } + + return lexbor_strtod_diyfp_strtod(start, length, exp); +} + +#undef LEXBOR_DECIMAL_POWER_MAX +#undef LEXBOR_DECIMAL_POWER_MIN + +#undef LEXBOR_UINT64_MAX +#undef LEXBOR_UINT64_DECIMAL_DIGITS_MAX + +#undef LEXBOR_DENOM_LOG +#undef LEXBOR_DENOM diff --git a/contrib/url/lexbor/core/strtod.h b/contrib/url/lexbor/core/strtod.h new file mode 100644 index 0000000000..7f15706bc9 --- /dev/null +++ b/contrib/url/lexbor/core/strtod.h @@ -0,0 +1,28 @@ +/* + * Copyright (C) Alexander Borisov + * + * Based on nxt_strtod.h from NGINX NJS project + * + * Copyright (C) Dmitry Volyntsev + * Copyright (C) Nginx, Inc. 
+ */ + +#ifndef LEXBOR_STRTOD_H +#define LEXBOR_STRTOD_H + +#ifdef __cplusplus +extern "C" { +#endif + +#include "lexbor/core/base.h" + + +LXB_API double +lexbor_strtod_internal(const lxb_char_t *start, size_t length, int exp); + + +#ifdef __cplusplus +} /* extern "C" */ +#endif + +#endif /* LEXBOR_STRTOD_H */ diff --git a/contrib/url/lexbor/core/swar.h b/contrib/url/lexbor/core/swar.h new file mode 100644 index 0000000000..992bf2037b --- /dev/null +++ b/contrib/url/lexbor/core/swar.h @@ -0,0 +1,97 @@ +/* + * Copyright (C) 2024 Alexander Borisov + * + * Author: Niels Dossche + */ + +#ifndef LEXBOR_SWAR_H +#define LEXBOR_SWAR_H + +#ifdef __cplusplus +extern "C" { +#endif + + +#include "lexbor/core/base.h" + + +/* + * Based on techniques from https://graphics.stanford.edu/~seander/bithacks.html + */ +#define LEXBOR_SWAR_ONES (~((size_t) 0) / 0xFF) +#define LEXBOR_SWAR_REPEAT(x) (LEXBOR_SWAR_ONES * (x)) +#define LEXBOR_SWAR_HAS_ZERO(v) (((v) - LEXBOR_SWAR_ONES) & ~(v) & LEXBOR_SWAR_REPEAT(0x80)) +#define LEXBOR_SWAR_IS_LITTLE_ENDIAN (*(unsigned char *) &(uint16_t){1}) + + +/* + * When handling hot loops that search for a set of characters, + * this function can be used to quickly move the data pointer much + * closer to the first occurrence of such a character. 
+ */ +lxb_inline const lxb_char_t * +lexbor_swar_seek4(const lxb_char_t *data, const lxb_char_t *end, + lxb_char_t c1, lxb_char_t c2, lxb_char_t c3, lxb_char_t c4) +{ + size_t bytes, matches, t1, t2, t3, t4; + + if (LEXBOR_SWAR_IS_LITTLE_ENDIAN) { + while (data + sizeof(size_t) <= end) { + memcpy(&bytes, data, sizeof(size_t)); + + t1 = bytes ^ LEXBOR_SWAR_REPEAT(c1); + t2 = bytes ^ LEXBOR_SWAR_REPEAT(c2); + t3 = bytes ^ LEXBOR_SWAR_REPEAT(c3); + t4 = bytes ^ LEXBOR_SWAR_REPEAT(c4); + matches = LEXBOR_SWAR_HAS_ZERO(t1) | LEXBOR_SWAR_HAS_ZERO(t2) + | LEXBOR_SWAR_HAS_ZERO(t3) | LEXBOR_SWAR_HAS_ZERO(t4); + + if (matches) { + data += ((((matches - 1) & LEXBOR_SWAR_ONES) * LEXBOR_SWAR_ONES) + >> (sizeof(size_t) * 8 - 8)) - 1; + break; + } else { + data += sizeof(size_t); + } + } + } + + return data; +} + +lxb_inline const lxb_char_t * +lexbor_swar_seek3(const lxb_char_t *data, const lxb_char_t *end, + lxb_char_t c1, lxb_char_t c2, lxb_char_t c3) +{ + size_t bytes, matches, t1, t2, t3; + + if (LEXBOR_SWAR_IS_LITTLE_ENDIAN) { + while (data + sizeof(size_t) <= end) { + memcpy(&bytes, data, sizeof(size_t)); + + t1 = bytes ^ LEXBOR_SWAR_REPEAT(c1); + t2 = bytes ^ LEXBOR_SWAR_REPEAT(c2); + t3 = bytes ^ LEXBOR_SWAR_REPEAT(c3); + matches = LEXBOR_SWAR_HAS_ZERO(t1) | LEXBOR_SWAR_HAS_ZERO(t2) + | LEXBOR_SWAR_HAS_ZERO(t3); + + if (matches) { + data += ((((matches - 1) & LEXBOR_SWAR_ONES) * LEXBOR_SWAR_ONES) + >> (sizeof(size_t) * 8 - 8)) - 1; + break; + } else { + data += sizeof(size_t); + } + } + } + + return data; +} + + +#ifdef __cplusplus +} /* extern "C" */ +#endif + +#endif /* LEXBOR_SWAR_H */ + diff --git a/contrib/url/lexbor/core/types.h b/contrib/url/lexbor/core/types.h new file mode 100644 index 0000000000..00f82ad1e5 --- /dev/null +++ b/contrib/url/lexbor/core/types.h @@ -0,0 +1,39 @@ +/* + * Copyright (C) 2018 Alexander Borisov + * + * Author: Alexander Borisov + */ + +#ifndef LEXBOR_TYPES_H +#define LEXBOR_TYPES_H + +#ifdef __cplusplus +extern "C" { +#endif + +#include 
+#include + +/* Inline */ +#ifdef _MSC_VER + #define lxb_inline static __forceinline +#else + #define lxb_inline static inline +#endif + + +/* Simple types */ +typedef uint32_t lxb_codepoint_t; +typedef unsigned char lxb_char_t; +typedef unsigned int lxb_status_t; + +/* Callbacks */ +typedef lxb_status_t (*lexbor_callback_f)(const lxb_char_t *buffer, + size_t size, void *ctx); + + +#ifdef __cplusplus +} /* extern "C" */ +#endif + +#endif /* LEXBOR_TYPES_H */ diff --git a/contrib/url/lexbor/core/utils.h b/contrib/url/lexbor/core/utils.h new file mode 100644 index 0000000000..c02c854ce5 --- /dev/null +++ b/contrib/url/lexbor/core/utils.h @@ -0,0 +1,29 @@ +/* + * Copyright (C) 2018 Alexander Borisov + * + * Author: Alexander Borisov + */ + +#ifndef LEXBOR_UTILS_H +#define LEXBOR_UTILS_H + +#ifdef __cplusplus +extern "C" { +#endif + +#include "lexbor/core/base.h" + + +#define lexbor_utils_whitespace(onechar, action, logic) \ + (onechar action ' ' logic \ + onechar action '\t' logic \ + onechar action '\n' logic \ + onechar action '\f' logic \ + onechar action '\r') + + +#ifdef __cplusplus +} /* extern "C" */ +#endif + +#endif /* LEXBOR_UTILS_H */ diff --git a/contrib/url/lexbor/url/base.h b/contrib/url/lexbor/url/base.h new file mode 100644 index 0000000000..596fb92a0f --- /dev/null +++ b/contrib/url/lexbor/url/base.h @@ -0,0 +1,32 @@ +/* + * Copyright (C) 2023-2024 Alexander Borisov + * + * Author: Alexander Borisov + */ + +#ifndef LEXBOR_URL_BASE_H +#define LEXBOR_URL_BASE_H + +#ifdef __cplusplus +extern "C" { +#endif + +#include "lexbor/core/base.h" +#include "lexbor/core/mraw.h" +#include "lexbor/core/str.h" + + +#define LXB_URL_VERSION_MAJOR 0 +#define LXB_URL_VERSION_MINOR 2 +#define LXB_URL_VERSION_PATCH 0 + +#define LXB_URL_VERSION_STRING LEXBOR_STRINGIZE(LXB_URL_VERSION_MAJOR) "." \ + LEXBOR_STRINGIZE(LXB_URL_VERSION_MINOR) "." 
\ + LEXBOR_STRINGIZE(LXB_URL_VERSION_PATCH) + + +#ifdef __cplusplus +} /* extern "C" */ +#endif + +#endif /* LEXBOR_URL_BASE_H */ diff --git a/contrib/url/lexbor/url/url.c b/contrib/url/lexbor/url/url.c new file mode 100644 index 0000000000..074964c3a4 --- /dev/null +++ b/contrib/url/lexbor/url/url.c @@ -0,0 +1,5049 @@ +/* + * Copyright (C) 2023-2024 Alexander Borisov + * + * Author: Alexander Borisov + */ + +#include +#include +#include +#include +#include + +#include "lexbor/url/url.h" +#include "lexbor/core/conv.h" +#include "lexbor/core/utils.h" +#include "lexbor/core/serialize.h" +#include "lexbor/core/swar.h" + +#define LEXBOR_STR_RES_MAP_LOWERCASE +#define LEXBOR_STR_RES_ALPHANUMERIC_CHARACTER +#define LEXBOR_STR_RES_ALPHA_CHARACTER +#define LEXBOR_STR_RES_CHAR_TO_TWO_HEX_VALUE +#define LEXBOR_STR_RES_MAP_HEX +#define LEXBOR_STR_RES_MAP_NUM +#include "lexbor/core/str_res.h" + + +#define LXB_URL_BUFFER_SIZE 4096 +#define LXB_URL_BUFFER_NUM_SIZE 128 +#define LXB_URL_DECODE_ERROR 0x1FFFFF + + +typedef enum { + LXB_URL_MAP_UNDEF = 0x00, + LXB_URL_MAP_C0 = 0x01, + LXB_URL_MAP_FRAGMENT = 0x02, + LXB_URL_MAP_QUERY = 0x04, + LXB_URL_MAP_SPECIAL_QUERY = 0x08, + LXB_URL_MAP_PATH = 0x10, + LXB_URL_MAP_USERINFO = 0x20, + LXB_URL_MAP_COMPONENT = 0x40, + LXB_URL_MAP_ALL = 0xff +} +lxb_url_map_type_t; + +typedef enum { + LXB_URL_HOST_OPT_UNDEF = 0 << 0, + LXB_URL_HOST_OPT_NOT_SPECIAL = 1 << 0, + LXB_URL_HOST_OPT_DECODE = 1 << 1, + LXB_URL_HOST_OPT_IDNA = 1 << 2 +} +lxb_url_host_opt_t; + +typedef struct { + lexbor_mraw_t *mraw; + lexbor_str_t *str; +} +lxb_url_idna_ctx_t; + + +static const uint8_t lxb_url_map[256] = +{ + LXB_URL_MAP_ALL, /* 0x00 */ + LXB_URL_MAP_ALL, /* 0x01 */ + LXB_URL_MAP_ALL, /* 0x02 */ + LXB_URL_MAP_ALL, /* 0x03 */ + LXB_URL_MAP_ALL, /* 0x04 */ + LXB_URL_MAP_ALL, /* 0x05 */ + LXB_URL_MAP_ALL, /* 0x06 */ + LXB_URL_MAP_ALL, /* 0x07 */ + LXB_URL_MAP_ALL, /* 0x08 */ + LXB_URL_MAP_ALL, /* 0x09 */ + LXB_URL_MAP_ALL, /* 0x0a */ + LXB_URL_MAP_ALL, /* 0x0b */ + 
LXB_URL_MAP_ALL, /* 0x0c */ + LXB_URL_MAP_ALL, /* 0x0d */ + LXB_URL_MAP_ALL, /* 0x0e */ + LXB_URL_MAP_ALL, /* 0x0f */ + LXB_URL_MAP_ALL, /* 0x10 */ + LXB_URL_MAP_ALL, /* 0x11 */ + LXB_URL_MAP_ALL, /* 0x12 */ + LXB_URL_MAP_ALL, /* 0x13 */ + LXB_URL_MAP_ALL, /* 0x14 */ + LXB_URL_MAP_ALL, /* 0x15 */ + LXB_URL_MAP_ALL, /* 0x16 */ + LXB_URL_MAP_ALL, /* 0x17 */ + LXB_URL_MAP_ALL, /* 0x18 */ + LXB_URL_MAP_ALL, /* 0x19 */ + LXB_URL_MAP_ALL, /* 0x1a */ + LXB_URL_MAP_ALL, /* 0x1b */ + LXB_URL_MAP_ALL, /* 0x1c */ + LXB_URL_MAP_ALL, /* 0x1d */ + LXB_URL_MAP_ALL, /* 0x1e */ + LXB_URL_MAP_ALL, /* 0x1f */ + LXB_URL_MAP_USERINFO|LXB_URL_MAP_FRAGMENT|LXB_URL_MAP_QUERY|LXB_URL_MAP_SPECIAL_QUERY|LXB_URL_MAP_PATH|LXB_URL_MAP_COMPONENT, /* 0x20 ( ) */ + LXB_URL_MAP_UNDEF, /* 0x21 (!) */ + LXB_URL_MAP_USERINFO|LXB_URL_MAP_FRAGMENT|LXB_URL_MAP_QUERY|LXB_URL_MAP_SPECIAL_QUERY|LXB_URL_MAP_PATH|LXB_URL_MAP_COMPONENT, /* 0x22 (") */ + LXB_URL_MAP_USERINFO|LXB_URL_MAP_QUERY|LXB_URL_MAP_SPECIAL_QUERY|LXB_URL_MAP_PATH|LXB_URL_MAP_COMPONENT, /* 0x23 (#) */ + LXB_URL_MAP_COMPONENT, /* 0x24 ($) */ + LXB_URL_MAP_UNDEF, /* 0x25 (%) */ + LXB_URL_MAP_COMPONENT, /* 0x26 (&) */ + LXB_URL_MAP_SPECIAL_QUERY, /* 0x27 (') */ + LXB_URL_MAP_UNDEF, /* 0x28 (() */ + LXB_URL_MAP_UNDEF, /* 0x29 ()) */ + LXB_URL_MAP_UNDEF, /* 0x2a (*) */ + LXB_URL_MAP_COMPONENT, /* 0x2b (+) */ + LXB_URL_MAP_COMPONENT, /* 0x2c (,) */ + LXB_URL_MAP_UNDEF, /* 0x2d (-) */ + LXB_URL_MAP_UNDEF, /* 0x2e (.) 
*/ + LXB_URL_MAP_USERINFO|LXB_URL_MAP_COMPONENT, /* 0x2f (/) */ + LXB_URL_MAP_UNDEF, /* 0x30 (0) */ + LXB_URL_MAP_UNDEF, /* 0x31 (1) */ + LXB_URL_MAP_UNDEF, /* 0x32 (2) */ + LXB_URL_MAP_UNDEF, /* 0x33 (3) */ + LXB_URL_MAP_UNDEF, /* 0x34 (4) */ + LXB_URL_MAP_UNDEF, /* 0x35 (5) */ + LXB_URL_MAP_UNDEF, /* 0x36 (6) */ + LXB_URL_MAP_UNDEF, /* 0x37 (7) */ + LXB_URL_MAP_UNDEF, /* 0x38 (8) */ + LXB_URL_MAP_UNDEF, /* 0x39 (9) */ + LXB_URL_MAP_USERINFO|LXB_URL_MAP_COMPONENT, /* 0x3a (:) */ + LXB_URL_MAP_USERINFO|LXB_URL_MAP_COMPONENT, /* 0x3b (;) */ + LXB_URL_MAP_USERINFO|LXB_URL_MAP_FRAGMENT|LXB_URL_MAP_QUERY|LXB_URL_MAP_SPECIAL_QUERY|LXB_URL_MAP_PATH|LXB_URL_MAP_COMPONENT, /* 0x3c (<) */ + LXB_URL_MAP_USERINFO|LXB_URL_MAP_COMPONENT, /* 0x3d (=) */ + LXB_URL_MAP_USERINFO|LXB_URL_MAP_FRAGMENT|LXB_URL_MAP_QUERY|LXB_URL_MAP_SPECIAL_QUERY|LXB_URL_MAP_PATH|LXB_URL_MAP_COMPONENT, /* 0x3e (>) */ + LXB_URL_MAP_PATH|LXB_URL_MAP_USERINFO|LXB_URL_MAP_COMPONENT, /* 0x3f (?) */ + LXB_URL_MAP_USERINFO|LXB_URL_MAP_COMPONENT, /* 0x40 (@) */ + LXB_URL_MAP_UNDEF, /* 0x41 (A) */ + LXB_URL_MAP_UNDEF, /* 0x42 (B) */ + LXB_URL_MAP_UNDEF, /* 0x43 (C) */ + LXB_URL_MAP_UNDEF, /* 0x44 (D) */ + LXB_URL_MAP_UNDEF, /* 0x45 (E) */ + LXB_URL_MAP_UNDEF, /* 0x46 (F) */ + LXB_URL_MAP_UNDEF, /* 0x47 (G) */ + LXB_URL_MAP_UNDEF, /* 0x48 (H) */ + LXB_URL_MAP_UNDEF, /* 0x49 (I) */ + LXB_URL_MAP_UNDEF, /* 0x4a (J) */ + LXB_URL_MAP_UNDEF, /* 0x4b (K) */ + LXB_URL_MAP_UNDEF, /* 0x4c (L) */ + LXB_URL_MAP_UNDEF, /* 0x4d (M) */ + LXB_URL_MAP_UNDEF, /* 0x4e (N) */ + LXB_URL_MAP_UNDEF, /* 0x4f (O) */ + LXB_URL_MAP_UNDEF, /* 0x50 (P) */ + LXB_URL_MAP_UNDEF, /* 0x51 (Q) */ + LXB_URL_MAP_UNDEF, /* 0x52 (R) */ + LXB_URL_MAP_UNDEF, /* 0x53 (S) */ + LXB_URL_MAP_UNDEF, /* 0x54 (T) */ + LXB_URL_MAP_UNDEF, /* 0x55 (U) */ + LXB_URL_MAP_UNDEF, /* 0x56 (V) */ + LXB_URL_MAP_UNDEF, /* 0x57 (W) */ + LXB_URL_MAP_UNDEF, /* 0x58 (X) */ + LXB_URL_MAP_UNDEF, /* 0x59 (Y) */ + LXB_URL_MAP_UNDEF, /* 0x5a (Z) */ + 
LXB_URL_MAP_USERINFO|LXB_URL_MAP_COMPONENT, /* 0x5b ([) */ + LXB_URL_MAP_USERINFO|LXB_URL_MAP_COMPONENT, /* 0x5c (\) */ + LXB_URL_MAP_USERINFO|LXB_URL_MAP_COMPONENT, /* 0x5d (]) */ + LXB_URL_MAP_USERINFO|LXB_URL_MAP_COMPONENT, /* 0x5e (^) */ + LXB_URL_MAP_UNDEF, /* 0x5f (_) */ + LXB_URL_MAP_PATH|LXB_URL_MAP_FRAGMENT|LXB_URL_MAP_USERINFO|LXB_URL_MAP_COMPONENT, /* 0x60 (`) */ + LXB_URL_MAP_UNDEF, /* 0x61 (a) */ + LXB_URL_MAP_UNDEF, /* 0x62 (b) */ + LXB_URL_MAP_UNDEF, /* 0x63 (c) */ + LXB_URL_MAP_UNDEF, /* 0x64 (d) */ + LXB_URL_MAP_UNDEF, /* 0x65 (e) */ + LXB_URL_MAP_UNDEF, /* 0x66 (f) */ + LXB_URL_MAP_UNDEF, /* 0x67 (g) */ + LXB_URL_MAP_UNDEF, /* 0x68 (h) */ + LXB_URL_MAP_UNDEF, /* 0x69 (i) */ + LXB_URL_MAP_UNDEF, /* 0x6a (j) */ + LXB_URL_MAP_UNDEF, /* 0x6b (k) */ + LXB_URL_MAP_UNDEF, /* 0x6c (l) */ + LXB_URL_MAP_UNDEF, /* 0x6d (m) */ + LXB_URL_MAP_UNDEF, /* 0x6e (n) */ + LXB_URL_MAP_UNDEF, /* 0x6f (o) */ + LXB_URL_MAP_UNDEF, /* 0x70 (p) */ + LXB_URL_MAP_UNDEF, /* 0x71 (q) */ + LXB_URL_MAP_UNDEF, /* 0x72 (r) */ + LXB_URL_MAP_UNDEF, /* 0x73 (s) */ + LXB_URL_MAP_UNDEF, /* 0x74 (t) */ + LXB_URL_MAP_UNDEF, /* 0x75 (u) */ + LXB_URL_MAP_UNDEF, /* 0x76 (v) */ + LXB_URL_MAP_UNDEF, /* 0x77 (w) */ + LXB_URL_MAP_UNDEF, /* 0x78 (x) */ + LXB_URL_MAP_UNDEF, /* 0x79 (y) */ + LXB_URL_MAP_UNDEF, /* 0x7a (z) */ + LXB_URL_MAP_PATH|LXB_URL_MAP_USERINFO|LXB_URL_MAP_COMPONENT, /* 0x7b ({) */ + LXB_URL_MAP_USERINFO|LXB_URL_MAP_COMPONENT, /* 0x7c (|) */ + LXB_URL_MAP_PATH|LXB_URL_MAP_USERINFO|LXB_URL_MAP_COMPONENT, /* 0x7d (}) */ + LXB_URL_MAP_UNDEF, /* 0x7e (~) */ + LXB_URL_MAP_ALL, /* 0x7f */ + LXB_URL_MAP_ALL, /* 0x80 */ + LXB_URL_MAP_ALL, /* 0x81 */ + LXB_URL_MAP_ALL, /* 0x82 */ + LXB_URL_MAP_ALL, /* 0x83 */ + LXB_URL_MAP_ALL, /* 0x84 */ + LXB_URL_MAP_ALL, /* 0x85 */ + LXB_URL_MAP_ALL, /* 0x86 */ + LXB_URL_MAP_ALL, /* 0x87 */ + LXB_URL_MAP_ALL, /* 0x88 */ + LXB_URL_MAP_ALL, /* 0x89 */ + LXB_URL_MAP_ALL, /* 0x8a */ + LXB_URL_MAP_ALL, /* 0x8b */ + LXB_URL_MAP_ALL, /* 0x8c */ + 
LXB_URL_MAP_ALL, /* 0x8d */ + LXB_URL_MAP_ALL, /* 0x8e */ + LXB_URL_MAP_ALL, /* 0x8f */ + LXB_URL_MAP_ALL, /* 0x90 */ + LXB_URL_MAP_ALL, /* 0x91 */ + LXB_URL_MAP_ALL, /* 0x92 */ + LXB_URL_MAP_ALL, /* 0x93 */ + LXB_URL_MAP_ALL, /* 0x94 */ + LXB_URL_MAP_ALL, /* 0x95 */ + LXB_URL_MAP_ALL, /* 0x96 */ + LXB_URL_MAP_ALL, /* 0x97 */ + LXB_URL_MAP_ALL, /* 0x98 */ + LXB_URL_MAP_ALL, /* 0x99 */ + LXB_URL_MAP_ALL, /* 0x9a */ + LXB_URL_MAP_ALL, /* 0x9b */ + LXB_URL_MAP_ALL, /* 0x9c */ + LXB_URL_MAP_ALL, /* 0x9d */ + LXB_URL_MAP_ALL, /* 0x9e */ + LXB_URL_MAP_ALL, /* 0x9f */ + LXB_URL_MAP_ALL, /* 0xa0 */ + LXB_URL_MAP_ALL, /* 0xa1 */ + LXB_URL_MAP_ALL, /* 0xa2 */ + LXB_URL_MAP_ALL, /* 0xa3 */ + LXB_URL_MAP_ALL, /* 0xa4 */ + LXB_URL_MAP_ALL, /* 0xa5 */ + LXB_URL_MAP_ALL, /* 0xa6 */ + LXB_URL_MAP_ALL, /* 0xa7 */ + LXB_URL_MAP_ALL, /* 0xa8 */ + LXB_URL_MAP_ALL, /* 0xa9 */ + LXB_URL_MAP_ALL, /* 0xaa */ + LXB_URL_MAP_ALL, /* 0xab */ + LXB_URL_MAP_ALL, /* 0xac */ + LXB_URL_MAP_ALL, /* 0xad */ + LXB_URL_MAP_ALL, /* 0xae */ + LXB_URL_MAP_ALL, /* 0xaf */ + LXB_URL_MAP_ALL, /* 0xb0 */ + LXB_URL_MAP_ALL, /* 0xb1 */ + LXB_URL_MAP_ALL, /* 0xb2 */ + LXB_URL_MAP_ALL, /* 0xb3 */ + LXB_URL_MAP_ALL, /* 0xb4 */ + LXB_URL_MAP_ALL, /* 0xb5 */ + LXB_URL_MAP_ALL, /* 0xb6 */ + LXB_URL_MAP_ALL, /* 0xb7 */ + LXB_URL_MAP_ALL, /* 0xb8 */ + LXB_URL_MAP_ALL, /* 0xb9 */ + LXB_URL_MAP_ALL, /* 0xba */ + LXB_URL_MAP_ALL, /* 0xbb */ + LXB_URL_MAP_ALL, /* 0xbc */ + LXB_URL_MAP_ALL, /* 0xbd */ + LXB_URL_MAP_ALL, /* 0xbe */ + LXB_URL_MAP_ALL, /* 0xbf */ + LXB_URL_MAP_ALL, /* 0xc0 */ + LXB_URL_MAP_ALL, /* 0xc1 */ + LXB_URL_MAP_ALL, /* 0xc2 */ + LXB_URL_MAP_ALL, /* 0xc3 */ + LXB_URL_MAP_ALL, /* 0xc4 */ + LXB_URL_MAP_ALL, /* 0xc5 */ + LXB_URL_MAP_ALL, /* 0xc6 */ + LXB_URL_MAP_ALL, /* 0xc7 */ + LXB_URL_MAP_ALL, /* 0xc8 */ + LXB_URL_MAP_ALL, /* 0xc9 */ + LXB_URL_MAP_ALL, /* 0xca */ + LXB_URL_MAP_ALL, /* 0xcb */ + LXB_URL_MAP_ALL, /* 0xcc */ + LXB_URL_MAP_ALL, /* 0xcd */ + LXB_URL_MAP_ALL, /* 0xce */ + LXB_URL_MAP_ALL, /* 
0xcf */ + LXB_URL_MAP_ALL, /* 0xd0 */ + LXB_URL_MAP_ALL, /* 0xd1 */ + LXB_URL_MAP_ALL, /* 0xd2 */ + LXB_URL_MAP_ALL, /* 0xd3 */ + LXB_URL_MAP_ALL, /* 0xd4 */ + LXB_URL_MAP_ALL, /* 0xd5 */ + LXB_URL_MAP_ALL, /* 0xd6 */ + LXB_URL_MAP_ALL, /* 0xd7 */ + LXB_URL_MAP_ALL, /* 0xd8 */ + LXB_URL_MAP_ALL, /* 0xd9 */ + LXB_URL_MAP_ALL, /* 0xda */ + LXB_URL_MAP_ALL, /* 0xdb */ + LXB_URL_MAP_ALL, /* 0xdc */ + LXB_URL_MAP_ALL, /* 0xdd */ + LXB_URL_MAP_ALL, /* 0xde */ + LXB_URL_MAP_ALL, /* 0xdf */ + LXB_URL_MAP_ALL, /* 0xe0 */ + LXB_URL_MAP_ALL, /* 0xe1 */ + LXB_URL_MAP_ALL, /* 0xe2 */ + LXB_URL_MAP_ALL, /* 0xe3 */ + LXB_URL_MAP_ALL, /* 0xe4 */ + LXB_URL_MAP_ALL, /* 0xe5 */ + LXB_URL_MAP_ALL, /* 0xe6 */ + LXB_URL_MAP_ALL, /* 0xe7 */ + LXB_URL_MAP_ALL, /* 0xe8 */ + LXB_URL_MAP_ALL, /* 0xe9 */ + LXB_URL_MAP_ALL, /* 0xea */ + LXB_URL_MAP_ALL, /* 0xeb */ + LXB_URL_MAP_ALL, /* 0xec */ + LXB_URL_MAP_ALL, /* 0xed */ + LXB_URL_MAP_ALL, /* 0xee */ + LXB_URL_MAP_ALL, /* 0xef */ + LXB_URL_MAP_ALL, /* 0xf0 */ + LXB_URL_MAP_ALL, /* 0xf1 */ + LXB_URL_MAP_ALL, /* 0xf2 */ + LXB_URL_MAP_ALL, /* 0xf3 */ + LXB_URL_MAP_ALL, /* 0xf4 */ + LXB_URL_MAP_ALL, /* 0xf5 */ + LXB_URL_MAP_ALL, /* 0xf6 */ + LXB_URL_MAP_ALL, /* 0xf7 */ + LXB_URL_MAP_ALL, /* 0xf8 */ + LXB_URL_MAP_ALL, /* 0xf9 */ + LXB_URL_MAP_ALL, /* 0xfa */ + LXB_URL_MAP_ALL, /* 0xfb */ + LXB_URL_MAP_ALL, /* 0xfc */ + LXB_URL_MAP_ALL, /* 0xfd */ + LXB_URL_MAP_ALL, /* 0xfe */ + LXB_URL_MAP_ALL, /* 0xff */ +}; + +/* + * U+0000 NULL, U+0009 TAB, U+000A LF, U+000D CR, U+0020 SPACE, U+0023 (#), + * U+002F (/), U+003A (:), U+003C (<), U+003E (>), U+003F (?), U+0040 (@), + * U+005B ([), U+005C (\), U+005D (]), U+005E (^), or U+007C (|). + * U+0000 NULL to U+001F, U+0025 (%), or U+007F DELETE. 
+ */ +static const lxb_char_t lxb_url_map_forbidden_domain_cp[128] = +{ + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, + 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, + 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, + 0x1e, 0x1f, 0x20, 0xff, 0xff, 0x23, 0xff, 0x25, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x2f, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x3a, 0xff, + 0x3c, 0xff, 0x3e, 0x3f, 0x40, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0x5b, 0x5c, 0x5d, 0x5e, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0x7c, 0xff, 0xff, 0x7f +}; + +/* + * U+0000 NULL, U+0009 TAB, U+000A LF, U+000D CR, U+0020 SPACE, U+0023 (#), + * U+002F (/), U+003A (:), U+003C (<), U+003E (>), U+003F (?), U+0040 (@), + * U+005B ([), U+005C (\), U+005D (]), U+005E (^), or U+007C (|). 
+ */ +static const lxb_char_t lxb_url_map_forbidden_host_cp[128] = +{ + 0x00, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x09, + 0x0a, 0xff, 0xff, 0x0d, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0x20, 0xff, 0xff, 0x23, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x2f, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0x3a, 0xff, + 0x3c, 0xff, 0x3e, 0x3f, 0x40, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0x5b, 0x5c, 0x5d, 0x5e, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0x7c, 0xff, 0xff, 0xff +}; + +static const lxb_char_t lxb_url_map_num_8[] = { + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, + 0x06, 0x07, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 
0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff +}; + +static const lxb_char_t lxb_url_codepoint_alphanumeric[0xA0] = { + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0x21, 0xff, 0xff, 0x24, 0xff, 0x26, 0x27, + 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, + 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, + 0x38, 0x39, 0x3a, 0x3b, 0xff, 0x3d, 0xff, 0x3f, + 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, + 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, + 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, + 0x58, 0x59, 0x5a, 0xff, 0xff, 0xff, 0xff, 0x5f, + 0xff, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, + 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, + 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, + 0x78, 0x79, 0x7a, 0xff, 0xff, 0xff, 0x7e, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, + 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff, 0xff +}; + +static const lxb_char_t lxb_url_path_map[256] = +{ + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x01, 0x00, 0x01, 0x01, 0x00, 0x04, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x01, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 
0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x01, 0x01, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x02, 0x01, 0x02, 0x02, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x01, 0x02, 0x01, 0x00, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, + 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02 +}; + +static const lxb_url_scheme_data_t +lxb_url_scheme_res[LXB_URL_SCHEMEL_TYPE__LAST_ENTRY] = +{ + {.name = lexbor_str("#undef"), 0, LXB_URL_SCHEMEL_TYPE__UNDEF }, + {.name = lexbor_str("#unknown"), 0, LXB_URL_SCHEMEL_TYPE__UNKNOWN}, + {.name = lexbor_str("http"), 80, LXB_URL_SCHEMEL_TYPE_HTTP }, + {.name = lexbor_str("https"), 443, LXB_URL_SCHEMEL_TYPE_HTTPS }, + {.name = lexbor_str("ws"), 80, LXB_URL_SCHEMEL_TYPE_WS }, + {.name = lexbor_str("wss"), 443, LXB_URL_SCHEMEL_TYPE_WSS }, + {.name = lexbor_str("ftp"), 21, LXB_URL_SCHEMEL_TYPE_FTP }, + {.name = lexbor_str("file"), 0, LXB_URL_SCHEMEL_TYPE_FILE } +}; + +static const size_t +lxb_url_scheme_res_length = sizeof(lxb_url_scheme_res) / sizeof(lxb_url_scheme_data_t); + + +#define 
/*
 * Doubles the scratch buffer used while building a path.
 *
 * sbuf       - current write position inside the buffer (updated).
 * sbuf_begin - start of the buffer (updated).
 * sbuf_end   - end of the buffer (updated).
 * sbuffer    - the caller's initial fixed buffer; while sbuf_begin still
 *              equals it the storage is not heap memory and must not be
 *              passed to lexbor_realloc().
 * last       - a second position inside the buffer (updated).
 *
 * The new capacity is twice the number of bytes between sbuf_begin and
 * sbuf.  When moving off the fixed buffer the bytes written so far are
 * copied into the new heap block, so that the preserved `sbuf` and `last`
 * offsets keep pointing at the same content.  On allocation failure any
 * heap buffer is released and the *enclosing function* returns NULL.
 */
#define LXB_URL_SBUF_REALLOC(sbuf, sbuf_begin, sbuf_end, sbuffer, last)       \
    do {                                                                      \
        size_t new_len, offset, lst;                                          \
        lxb_char_t *tmp;                                                      \
                                                                              \
        lst = (last) - (sbuf_begin);                                          \
        offset = (sbuf) - (sbuf_begin);                                       \
        new_len = offset << 1;                                                \
                                                                              \
        if ((sbuf_begin) == (sbuffer)) {                                      \
            tmp = lexbor_malloc(new_len);                                     \
            if (tmp == NULL) {                                                \
                return NULL;                                                  \
            }                                                                 \
                                                                              \
            /* Carry over what was already written. */                        \
            memcpy(tmp, (sbuf_begin), offset);                                \
        }                                                                     \
        else {                                                                \
            tmp = lexbor_realloc((sbuf_begin), new_len);                      \
            if (tmp == NULL) {                                                \
                lexbor_free(sbuf_begin);                                      \
                return NULL;                                                  \
            }                                                                 \
        }                                                                     \
                                                                              \
        (sbuf) = tmp + offset;                                                \
        (last) = tmp + lst;                                                   \
        (sbuf_begin) = tmp;                                                   \
        (sbuf_end) = tmp + new_len;                                           \
    }                                                                         \
    while (false)
+lxb_url_path_dot_count(lxb_url_t *url, const lxb_char_t *p, + const lxb_char_t *end, const lxb_char_t *sbuf_begin, + lxb_char_t **sbuf, lxb_char_t **last, size_t *path_count, + bool bqs); + +static void +lxb_url_path_fix_windows_drive(lxb_url_t *url, lxb_char_t *sbuf, + const lxb_char_t *last, size_t count); + +static lxb_status_t +lxb_url_percent_encode_after_utf_8(const lxb_char_t *data, + const lxb_char_t *end, lexbor_str_t *str, + lexbor_mraw_t *mraw, + lxb_url_map_type_t enmap, + bool space_as_plus); + +static lxb_status_t +lxb_url_host_parse(lxb_url_parser_t *parser, const lxb_char_t *data, + const lxb_char_t *end, lxb_url_host_t *host, + lexbor_mraw_t *mraw, lxb_url_host_opt_t opt); + +static lxb_status_t +lxb_url_ipv4_parse(lxb_url_parser_t *parser, const lxb_char_t *data, + const lxb_char_t *end, uint32_t *ipv6); + +static lxb_status_t +lxb_url_ipv4_number_parse(const lxb_char_t *data, + const lxb_char_t *end, uint64_t *num); + +static bool +lxb_url_is_ipv4(lxb_url_parser_t *parser, const lxb_char_t *data, + const lxb_char_t *end); + +static lxb_status_t +lxb_url_ipv6_parse(lxb_url_parser_t *parser, const lxb_char_t *data, + const lxb_char_t *end, uint16_t *ipv6); + +static lxb_status_t +lxb_url_ipv4_in_ipv6_parse(lxb_url_parser_t *parser, const lxb_char_t **data, + const lxb_char_t *end, uint16_t **pieces); + +static lxb_status_t +lxb_url_opaque_host_parse(lxb_url_parser_t *parser, const lxb_char_t *data, + const lxb_char_t *end, lxb_url_host_t *host, + lexbor_mraw_t *mraw); + +static lxb_status_t +lxb_url_percent_decode(const lxb_char_t *data, const lxb_char_t *end, + lexbor_str_t *str, lexbor_mraw_t *mraw, + lxb_url_host_opt_t *opt); + +static const lxb_char_t * +lxb_url_path_part_by_index(const lxb_url_t *url, size_t index, + size_t *out_length); + +static lxb_status_t +lxb_url_host_set_h(lxb_url_t *url, lxb_url_parser_t *parser, + const lxb_char_t *host, size_t length, + lxb_url_state_t override_state); + +static lxb_status_t 
+lxb_url_callback_length(const lxb_char_t *data, size_t len, void *ctx); + +static lxb_codepoint_t +lxb_url_decode_valid_utf_8_single(const lxb_char_t **data, + const lxb_char_t *end); +static lxb_codepoint_t +lxb_url_decode_valid_utf_8_single_reverse(const lxb_char_t **end, + const lxb_char_t *begin); + + +lxb_url_parser_t * +lxb_url_parser_create(void) +{ + return lexbor_calloc(1, sizeof(lxb_url_parser_t)); +} + +lxb_status_t +lxb_url_parser_init(lxb_url_parser_t *parser, lexbor_mraw_t *mraw) +{ + bool itmy; + lxb_status_t status; + + if (parser == NULL) { + return LXB_STATUS_ERROR_OBJECT_IS_NULL; + } + + itmy = false; + + if (mraw == NULL) { + mraw = lexbor_mraw_create(); + status = lexbor_mraw_init(mraw, LXB_URL_BUFFER_SIZE); + if (status != LXB_STATUS_OK) { + goto failed; + } + + itmy = true; + } + + parser->mraw = mraw; + parser->log = NULL; + + return LXB_STATUS_OK; + +failed: + + if (itmy) { + (void) lexbor_mraw_destroy(mraw, true); + } + + memset(parser, 0x00, sizeof(lxb_url_parser_t)); + + return status; +} + +void +lxb_url_parser_clean(lxb_url_parser_t *parser) +{ + parser->url = NULL; + + if (parser->log != NULL) { + lexbor_plog_clean(parser->log); + } +} + +lxb_url_parser_t * +lxb_url_parser_destroy(lxb_url_parser_t *parser, bool destroy_self) +{ + if (parser == NULL) { + return NULL; + } + + parser->log = lexbor_plog_destroy(parser->log, true); + + if (destroy_self) { + return lexbor_free(parser); + } + + return parser; +} + +void +lxb_url_parser_memory_destroy(lxb_url_parser_t *parser) +{ + parser->mraw = lexbor_mraw_destroy(parser->mraw, true); +} + +static lxb_status_t +lxb_url_log_append(lxb_url_parser_t *parser, const lxb_char_t *pos, + lxb_url_error_type_t type) +{ + void *entry; + lxb_status_t status; + + if (parser->log == NULL) { + parser->log = lexbor_plog_create(); + status = lexbor_plog_init(parser->log, 5, sizeof(lexbor_plog_entry_t)); + if (status != LXB_STATUS_OK) { + return status; + } + } + + entry = lexbor_plog_push(parser->log, pos, 
NULL, type); + if (entry == NULL) { + return LXB_STATUS_ERROR_MEMORY_ALLOCATION; + } + + return LXB_STATUS_OK; +} + +static lxb_status_t +lxb_url_str_init(lexbor_str_t *str, lexbor_mraw_t *mraw, size_t length) +{ + size_t size; + const lxb_char_t *p; + + if (str->data == NULL) { + p = lexbor_str_init(str, mraw, length); + if (p == NULL) { + return LXB_STATUS_ERROR_MEMORY_ALLOCATION; + } + } + else { + size = str->length + length; + + if (size > lexbor_str_size(str)) { + p = lexbor_str_realloc(str, mraw, size); + if (p == NULL) { + return LXB_STATUS_ERROR_MEMORY_ALLOCATION; + } + } + } + + return LXB_STATUS_OK; +} + +static lxb_status_t +lxb_url_str_copy(const lexbor_str_t *src, lexbor_str_t *dst, + lexbor_mraw_t *dst_mraw) +{ + if (src->data == NULL) { + *dst = *src; + return LXB_STATUS_OK; + } + + if (dst->data != NULL) { + if (dst->length >= src->length) { + /* +1 == '\0' */ + memcpy(dst->data, src->data, src->length + 1); + + return LXB_STATUS_OK; + } + + (void) lexbor_str_destroy(dst, dst_mraw, false); + } + + (void) lexbor_str_init_append(dst, dst_mraw, + src->data, src->length); + + return (dst->data != NULL) ? 
LXB_STATUS_OK + : LXB_STATUS_ERROR_MEMORY_ALLOCATION; +} + +lxb_inline bool +lxb_url_is_noncharacter(lxb_codepoint_t cp) +{ + if (cp >= 0xFDD0 && cp <= 0xFDEF) { + return true; + } + + switch (cp) { + case 0xFFFE: + case 0xFFFF: + case 0x1FFFE: + case 0x1FFFF: + case 0x2FFFE: + case 0x2FFFF: + case 0x3FFFE: + case 0x3FFFF: + case 0x4FFFE: + case 0x4FFFF: + case 0x5FFFE: + case 0x5FFFF: + case 0x6FFFE: + case 0x6FFFF: + case 0x7FFFE: + case 0x7FFFF: + case 0x8FFFE: + case 0x8FFFF: + case 0x9FFFE: + case 0x9FFFF: + case 0xAFFFE: + case 0xAFFFF: + case 0xBFFFE: + case 0xBFFFF: + case 0xCFFFE: + case 0xCFFFF: + case 0xDFFFE: + case 0xDFFFF: + case 0xEFFFE: + case 0xEFFFF: + case 0xFFFFE: + case 0xFFFFF: + case 0x10FFFE: + case 0x10FFFF: + return true; + + default: + break; + } + + return false; +} + +lxb_inline bool +lxb_url_is_url_codepoint(lxb_codepoint_t cp) +{ + if (cp >= 0x00A0 && cp <= 0x1FFFFF) { + /* Leading and trailing surrogate. */ + if ((cp >= 0xD800 && cp <= 0xDFFF)) { + return false; + } + + return !(cp > 0x10FFFF || lxb_url_is_noncharacter(cp)); + } + + return lxb_url_codepoint_alphanumeric[(lxb_char_t) cp] != 0xFF; +} + +lxb_inline bool +lxb_url_is_special(const lxb_url_t *url) +{ + return url->scheme.type != LXB_URL_SCHEMEL_TYPE__UNKNOWN; +} + +lxb_inline const lxb_url_scheme_data_t * +lxb_url_scheme_by_type(lxb_url_scheme_type_t type) +{ + return &lxb_url_scheme_res[type]; +} + +lxb_inline bool +lxb_url_scheme_is_special(const lxb_url_scheme_data_t *scheme) +{ + return scheme->type != LXB_URL_SCHEMEL_TYPE__UNKNOWN; +} + +lxb_inline bool +lxb_url_scheme_is_equal(const lxb_url_scheme_t *first, + const lxb_url_scheme_t *second) +{ + if (first->type != second->type) { + return false; + } + + if (first->type == LXB_URL_SCHEMEL_TYPE__UNKNOWN) { + if (first->name.length != second->name.length) { + return false; + } + + return memcmp(first->name.data, second->name.data, + first->name.length) == 0; + } + + return true; +} + +lxb_inline bool 
+lxb_url_scheme_equal_port(lxb_url_scheme_type_t type, uint16_t port) +{ + return lxb_url_scheme_res[type].port == port; +} + +static lxb_status_t +lxb_url_scheme_copy(const lxb_url_scheme_t *src, lxb_url_scheme_t *dst, + lexbor_mraw_t *dst_mraw) +{ + dst->type = src->type; + + return lxb_url_str_copy(&src->name, &dst->name, dst_mraw); +} + +static lxb_status_t +lxb_url_scheme_copy_special(const lxb_url_scheme_data_t *src, + lxb_url_scheme_t *dst, lexbor_mraw_t *dst_mraw) +{ + dst->type = src->type; + + return lxb_url_str_copy(&src->name, &dst->name, dst_mraw); +} + +static void +lxb_url_path_set_null(lxb_url_t *url) +{ + if (url->path.str.data == NULL) { + return; + } + + (void) lexbor_str_destroy(&url->path.str, url->mraw, false); + + url->path.str.length = 0; + url->path.length = 0; + url->path.opaque = false; +} + +static lxb_status_t +lxb_url_path_copy(const lxb_url_t *src, lxb_url_t *dst) +{ + lexbor_str_t *to; + const lexbor_str_t *str; + + if (dst->path.str.data != NULL) { + lxb_url_path_set_null(dst); + } + + dst->path.opaque = src->path.opaque; + + if (src->path.str.data == NULL) { + return LXB_STATUS_OK; + } + + dst->path.length = src->path.length; + + str = &src->path.str; + to = &dst->path.str; + + to->data = lexbor_mraw_alloc(dst->mraw, (str->length + 1)); + if (to->data == NULL) { + return LXB_STATUS_ERROR_MEMORY_ALLOCATION; + } + + /* +1 == '\0' */ + memcpy(to->data, str->data, str->length + 1); + + to->length = str->length; + + return LXB_STATUS_OK; +} + +static void +lxb_url_path_shorten(lxb_url_t *url) +{ + lexbor_str_t *str; + lxb_char_t *p, *begin; + + str = &url->path.str; + + if (url->scheme.type == LXB_URL_SCHEMEL_TYPE_FILE + && url->path.length == 1) + { + if (str->length >= 3 + && lxb_url_is_windows_letter(str->data[1]) + && str->data[2] == ':') + { + return; + } + } + + if (url->path.str.data != NULL) { + url->path.length -= 1; + + begin = str->data; + p = begin + str->length; + + while (p > begin) { + p -= 1; + + if (*p == '/') { + *p = 
'\0'; + break; + } + } + + str->length = p - begin; + } +} + +static lxb_status_t +lxb_url_path_append_wo_slash(lxb_url_t *url, + const lxb_char_t *data, size_t length) +{ + lxb_char_t *p; + + if (url->path.str.data == NULL) { + p = lexbor_str_init(&url->path.str, url->mraw, length); + if (p == NULL) { + return LXB_STATUS_ERROR_MEMORY_ALLOCATION; + } + } + + p = lexbor_str_append(&url->path.str, url->mraw, data, length); + + return (p != NULL) ? LXB_STATUS_OK : LXB_STATUS_ERROR_MEMORY_ALLOCATION; +} + +static lxb_status_t +lxb_url_path_append(lxb_url_t *url, const lxb_char_t *data, size_t length) +{ + size_t len; + lxb_char_t *p; + lexbor_str_t *str; + + str = &url->path.str; + + if (str->data == NULL) { + p = lexbor_str_init(str, url->mraw, length + 1); + if (p == NULL) { + return LXB_STATUS_ERROR_MEMORY_ALLOCATION; + } + } + + len = str->length; + str->length += 1; + + p = lexbor_str_append(&url->path.str, url->mraw, data, length); + + str->data[len] = '/'; + + return (p != NULL) ? LXB_STATUS_OK : LXB_STATUS_ERROR_MEMORY_ALLOCATION; +} + +static lxb_status_t +lxb_url_path_list_push(lxb_url_t *url, lexbor_str_t *data) +{ + return lxb_url_path_append_wo_slash(url, data->data, data->length); +} + +lxb_inline lxb_status_t +lxb_url_query_copy(const lexbor_str_t *src, lexbor_str_t *dst, + lexbor_mraw_t *dst_mraw) +{ + return lxb_url_str_copy(src, dst, dst_mraw); +} + +lxb_inline lxb_status_t +lxb_url_username_copy(const lexbor_str_t *src, lexbor_str_t *dst, + lexbor_mraw_t *dst_mraw) +{ + return lxb_url_str_copy(src, dst, dst_mraw); +} + +lxb_inline lxb_status_t +lxb_url_password_copy(const lexbor_str_t *src, lexbor_str_t *dst, + lexbor_mraw_t *dst_mraw) +{ + return lxb_url_str_copy(src, dst, dst_mraw); +} + +static lxb_status_t +lxb_url_host_copy(const lxb_url_host_t *src, lxb_url_host_t *dst, + lexbor_mraw_t *dst_mraw) +{ + if (dst->type != LXB_URL_HOST_TYPE__UNDEF) { + if (src->type == LXB_URL_HOST_TYPE__UNDEF) { + if (dst->type <= LXB_URL_HOST_TYPE_OPAQUE) { + 
(void) lexbor_str_destroy(&dst->u.domain, dst_mraw, false); + } + + dst->type = LXB_URL_HOST_TYPE__UNDEF; + + return LXB_STATUS_OK; + } + + if (dst->type <= LXB_URL_HOST_TYPE_OPAQUE) { + if (src->type <= LXB_URL_HOST_TYPE_OPAQUE) { + dst->type = src->type; + + return lxb_url_str_copy(&src->u.domain, + &dst->u.domain, dst_mraw); + } + + (void) lexbor_str_destroy(&dst->u.domain, dst_mraw, false); + } + } + + if (src->type <= LXB_URL_HOST_TYPE_OPAQUE) { + dst->type = src->type; + + if (src->type == LXB_URL_HOST_TYPE__UNDEF) { + return LXB_STATUS_OK; + } + + return lxb_url_str_copy(&src->u.domain, + &dst->u.domain, dst_mraw); + } + + if (src->type == LXB_URL_HOST_TYPE_IPV6) { + memcpy(dst->u.ipv6, src->u.ipv6, sizeof(src->u.ipv6)); + } + else { + dst->u.ipv4 = src->u.ipv4; + } + + return LXB_STATUS_OK; +} + +lxb_inline void +lxb_url_host_destroy(lxb_url_host_t *host, lexbor_mraw_t *mraw) +{ + if (host->type != LXB_URL_HOST_TYPE__UNDEF) { + if (host->type <= LXB_URL_HOST_TYPE_OPAQUE) { + (void) lexbor_str_destroy(&host->u.domain, mraw, false); + } + } +} + +static void +lxb_url_host_set_empty(lxb_url_host_t *host, lexbor_mraw_t *mraw) +{ + lxb_url_host_destroy(host, mraw); + + host->type = LXB_URL_HOST_TYPE_EMPTY; +} + +static bool +lxb_url_host_eq(lxb_url_host_t *host, const lxb_char_t *data, size_t length) +{ + lexbor_str_t *str; + + if (host->type != LXB_URL_HOST_TYPE__UNDEF) { + if (host->type <= LXB_URL_HOST_TYPE_OPAQUE) { + str = &host->u.domain; + + return str->length == length + && memcmp(data, str->data, length) == 0; + } + } + + return false; +} + +lxb_inline void +lxb_url_port_set(lxb_url_t *url, uint16_t port) +{ + url->port = port; + url->has_port = true; +} + +static void +lxb_url_fragment_set_null(lxb_url_t *url) +{ + if (url->fragment.data != NULL) { + (void) lexbor_str_destroy(&url->fragment, url->mraw, false); + } +} + +lxb_inline bool +lxb_url_includes_credentials(const lxb_url_t *url) +{ + return url->username.length != 0 || url->password.length != 
0; +} + +static bool +lxb_url_start_windows_drive_letter(const lxb_char_t *data, + const lxb_char_t *end) +{ + size_t length = end - data; + + if (length < 2) { + return false; + } + + if (!lxb_url_is_windows_letter(data[0]) + || (data[1] != ':' && data[1] != '|')) + { + return false; + } + + if (length > 2 && !( data[2] == '/' || data[2] == '\\' + || data[2] == '?' || data[2] == '#')) + { + return false; + } + + return true; +} + +static bool +lxb_url_windows_drive_letter(const lxb_char_t *data, const lxb_char_t *end) +{ + size_t length = end - data; + + if (length < 2) { + return false; + } + + return lxb_url_is_windows_letter(data[0]) + && (data[1] == ':' || data[1] == '|'); +} + +static bool +lxb_url_normalized_windows_drive_letter(const lxb_char_t *data, + const lxb_char_t *end) +{ + size_t length = end - data; + + if (length < 2) { + return false; + } + + return lxb_url_is_windows_letter(data[0]) && data[1] == ':'; +} + +static bool +lxb_url_cannot_have_user_pass_port(lxb_url_t *url) +{ + return url->host.type == LXB_URL_HOST_TYPE_EMPTY + || url->host.type == LXB_URL_HOST_TYPE__UNDEF + || url->scheme.type == LXB_URL_SCHEMEL_TYPE_FILE; +} + +lxb_url_t * +lxb_url_parse(lxb_url_parser_t *parser, const lxb_url_t *base_url, + const lxb_char_t *data, size_t length) +{ + (void) lxb_url_parse_basic(parser, NULL, base_url, data, length, + LXB_URL_STATE__UNDEF); + return parser->url; +} + +lxb_status_t +lxb_url_parse_basic(lxb_url_parser_t *parser, lxb_url_t *url, + const lxb_url_t *base_url, + const lxb_char_t *data, size_t length, + lxb_url_state_t override_state) +{ + lxb_status_t status; + + status = lxb_url_parse_basic_h(parser, url, base_url, data, + length, override_state); + if (status != LXB_STATUS_OK) { + if (parser->url != url) { + parser->url = lxb_url_destroy(parser->url); + } + + return status; + } + + return LXB_STATUS_OK; +} + +static lxb_status_t +lxb_url_parse_basic_h(lxb_url_parser_t *parser, lxb_url_t *url, + const lxb_url_t *base_url, + const 
lxb_char_t *data, size_t length, + lxb_url_state_t override_state) +{ + bool at_sign, inside_bracket; + size_t len; + uint32_t port; + lxb_status_t status; + lexbor_str_t tmp_str; + lxb_url_state_t state; + const lxb_char_t *p, *begin, *end, *tmp, *pswd, *buf, *orig_data; + lxb_char_t c; + lxb_codepoint_t cp; + lxb_url_map_type_t map_type; + const lxb_url_scheme_data_t *schm; + lxb_url_host_opt_t opt; + + static const lexbor_str_t mp_str = lexbor_str(""); + static const lexbor_str_t lh_str = lexbor_str("localhost"); + + if (url == NULL) { + url = lexbor_mraw_calloc(parser->mraw, sizeof(lxb_url_t)); + if (url == NULL) { + return LXB_STATUS_ERROR_MEMORY_ALLOCATION; + } + + url->mraw = parser->mraw; + + status = lxb_url_leading_trailing(parser, &data, &length); + if (status != LXB_STATUS_OK) { + return status; + } + } + + parser->url = url; + orig_data = data; + + buf = lxb_url_remove_tab_newline(parser, data, &length); + if (buf != data) { + if (buf == NULL) { + return LXB_STATUS_ERROR_MEMORY_ALLOCATION; + } + + data = buf; + } + + state = LXB_URL_STATE_SCHEME_START_STATE; + + if (override_state != LXB_URL_STATE__UNDEF) { + state = override_state; + } + + inside_bracket = false; + + p = data; + end = data + length; + + /* And go. */ + + schm = lxb_url_scheme_by_type(LXB_URL_SCHEMEL_TYPE__UNDEF); + +again: + + switch (state) { + case LXB_URL_STATE_SCHEME_START_STATE: + if (p >= end || lexbor_str_res_alpha_character[*p] == 0xff) { + if (override_state == LXB_URL_STATE__UNDEF) { + state = LXB_URL_STATE_NO_SCHEME_STATE; + goto again; + } + + lxb_url_parse_return(orig_data, buf, LXB_STATUS_ERROR_UNEXPECTED_DATA); + } + + /* Fall through. 
*/ + + case LXB_URL_STATE_SCHEME_STATE: + do { + p++; + } + while (p < end && (lexbor_str_res_alphanumeric_character[*p] != 0xff + || *p == '+' || *p == '-' || *p == '.')); + + if (p >= end || *p != ':') { + if (override_state == LXB_URL_STATE__UNDEF) { + p = data; + + state = LXB_URL_STATE_NO_SCHEME_STATE; + goto again; + } + else if (p < end || override_state != LXB_URL_STATE_SCHEME_START_STATE) { + lxb_url_parse_return(orig_data, buf, LXB_STATUS_ERROR_UNEXPECTED_DATA); + } + } + + schm = lxb_url_scheme_find(data, p - data); + + if (override_state != LXB_URL_STATE__UNDEF) { + if (lxb_url_is_special(url) != lxb_url_scheme_is_special(schm)) { + lxb_url_parse_return(orig_data, buf, LXB_STATUS_OK); + } + + if (url->has_port || lxb_url_includes_credentials(url)) { + if (schm->type == LXB_URL_SCHEMEL_TYPE_FILE) { + lxb_url_parse_return(orig_data, buf, LXB_STATUS_OK); + } + } + + if (url->scheme.type == LXB_URL_SCHEMEL_TYPE_FILE + && url->host.type == LXB_URL_HOST_TYPE_EMPTY) + { + lxb_url_parse_return(orig_data, buf, LXB_STATUS_OK); + } + } + + url->scheme.type = schm->type; + url->scheme.name.length = 0; + + status = lxb_url_str_init(&url->scheme.name, url->mraw, p - data); + if (status != LXB_STATUS_OK) { + lxb_url_parse_return(orig_data, buf, LXB_STATUS_ERROR_MEMORY_ALLOCATION); + } + + (void) lexbor_str_append_lowercase(&url->scheme.name, url->mraw, + data, p - data); + + p += 1; + + if (override_state != LXB_URL_STATE__UNDEF) { + if (url->has_port && url->port == schm->port) { + url->port = 0; + url->has_port = false; + } + + lxb_url_parse_return(orig_data, buf, LXB_STATUS_OK); + } + + if (schm->type == LXB_URL_SCHEMEL_TYPE_FILE) { + if (end - p < 2 || p[0] != '/' || p[1] != '/') { + status = lxb_url_log_append(parser, p, + LXB_URL_ERROR_TYPE_SPECIAL_SCHEME_MISSING_FOLLOWING_SOLIDUS); + if (status != LXB_STATUS_OK) { + lxb_url_parse_return(orig_data, buf, status); + } + } + + state = LXB_URL_STATE_FILE_STATE; + goto again; + } + + if 
(lxb_url_scheme_is_special(schm)) { + if (base_url != NULL + && lxb_url_scheme_is_equal(&url->scheme, &base_url->scheme)) + { + state = LXB_URL_STATE_SPECIAL_RELATIVE_OR_AUTHORITY_STATE; + } + else { + state = LXB_URL_STATE_SPECIAL_AUTHORITY_SLASHES_STATE; + } + + goto again; + } + + if (p < end && *p == '/') { + p += 1; + state = LXB_URL_STATE_PATH_OR_AUTHORITY_STATE; + goto again; + } + + lxb_url_path_set_null(url); + + state = LXB_URL_STATE_OPAQUE_PATH_STATE; + goto again; + + case LXB_URL_STATE_NO_SCHEME_STATE: + if (base_url == NULL) { + goto failed_non_relative_url; + } + + if (base_url->path.opaque) { + if (p >= end || *p != '#') { + goto failed_non_relative_url; + } + + p += 1; + + status = lxb_url_scheme_copy(&base_url->scheme, + &url->scheme, url->mraw); + if (status != LXB_STATUS_OK) { + lxb_url_parse_return(orig_data, buf, status); + } + + status = lxb_url_path_copy(base_url, url); + if (status != LXB_STATUS_OK) { + lxb_url_parse_return(orig_data, buf, status); + } + + status = lxb_url_query_copy(&base_url->query, &url->query, + url->mraw); + if (status != LXB_STATUS_OK) { + lxb_url_parse_return(orig_data, buf, status); + } + + lxb_url_fragment_set_null(url); + + state = LXB_URL_STATE_FRAGMENT_STATE; + goto again; + } + + if (base_url->scheme.type != LXB_URL_SCHEMEL_TYPE_FILE) { + state = LXB_URL_STATE_RELATIVE_STATE; + goto again; + } + + state = LXB_URL_STATE_FILE_STATE; + goto again; + + case LXB_URL_STATE_SPECIAL_RELATIVE_OR_AUTHORITY_STATE: + if (end - p > 1 && p[0] == '/' && p[1] == '/') { + p += 2; + state = LXB_URL_STATE_SPECIAL_AUTHORITY_IGNORE_SLASHES_STATE; + goto again; + } + + status = lxb_url_log_append(parser, p, + LXB_URL_ERROR_TYPE_SPECIAL_SCHEME_MISSING_FOLLOWING_SOLIDUS); + if (status != LXB_STATUS_OK) { + lxb_url_parse_return(orig_data, buf, status); + } + + state = LXB_URL_STATE_RELATIVE_STATE; + goto again; + + case LXB_URL_STATE_PATH_OR_AUTHORITY_STATE: + if (p < end && *p == '/') { + p += 1; + state = 
LXB_URL_STATE_AUTHORITY_STATE; + } + else { + state = LXB_URL_STATE_PATH_STATE; + } + + goto again; + + case LXB_URL_STATE_RELATIVE_STATE: + status = lxb_url_scheme_copy(&base_url->scheme, &url->scheme, url->mraw); + if (status != LXB_STATUS_OK) { + lxb_url_parse_return(orig_data, buf, status); + } + + if (end - p >= 1) { + if (*p == '/') { + p += 1; + state = LXB_URL_STATE_RELATIVE_SLASH_STATE; + goto again; + } + + if (lxb_url_is_special(url) && *p == '\\') { + p += 1; + + status = lxb_url_log_append(parser, p, + LXB_URL_ERROR_TYPE_INVALID_REVERSE_SOLIDUS); + if (status != LXB_STATUS_OK) { + lxb_url_parse_return(orig_data, buf, status); + } + + state = LXB_URL_STATE_RELATIVE_SLASH_STATE; + goto again; + } + } + + status = lxb_url_username_copy(&base_url->username, &url->username, + url->mraw); + if (status != LXB_STATUS_OK) { + lxb_url_parse_return(orig_data, buf, status); + } + + status = lxb_url_password_copy(&base_url->password, &url->password, + url->mraw); + if (status != LXB_STATUS_OK) { + lxb_url_parse_return(orig_data, buf, status); + } + + status = lxb_url_host_copy(&base_url->host, &url->host, url->mraw); + if (status != LXB_STATUS_OK) { + lxb_url_parse_return(orig_data, buf, status); + } + + if (base_url->has_port) { + lxb_url_port_set(url, base_url->port); + } + + status = lxb_url_path_copy(base_url, url); + if (status != LXB_STATUS_OK) { + lxb_url_parse_return(orig_data, buf, status); + } + + status = lxb_url_query_copy(&base_url->query, &url->query, url->mraw); + if (status != LXB_STATUS_OK) { + lxb_url_parse_return(orig_data, buf, status); + } + + if (end - p == 0) { + lxb_url_parse_return(orig_data, buf, LXB_STATUS_OK); + } + + if (*p == '?') { + p += 1; + + (void) lexbor_str_destroy(&url->query, url->mraw, false); + + state = LXB_URL_STATE_QUERY_STATE; + goto again; + } + + if (*p == '#') { + p += 1; + + (void) lexbor_str_destroy(&url->fragment, url->mraw, false); + + state = LXB_URL_STATE_FRAGMENT_STATE; + goto again; + } + + (void) 
lexbor_str_destroy(&url->query, url->mraw, false); + + lxb_url_path_shorten(url); + + state = LXB_URL_STATE_PATH_STATE; + goto again; + + case LXB_URL_STATE_RELATIVE_SLASH_STATE: + c = (end - p >= 1) ? *p : '\0'; + + if (lxb_url_is_special(url) && (c == '/' || c == '\\')) { + if (c == '\\') { + status = lxb_url_log_append(parser, p, + LXB_URL_ERROR_TYPE_INVALID_REVERSE_SOLIDUS); + if (status != LXB_STATUS_OK) { + lxb_url_parse_return(orig_data, buf, status); + } + } + + p += 1; + state = LXB_URL_STATE_SPECIAL_AUTHORITY_IGNORE_SLASHES_STATE; + goto again; + } + + if (c == '/') { + p += 1; + state = LXB_URL_STATE_AUTHORITY_STATE; + goto again; + } + + status = lxb_url_username_copy(&base_url->username, &url->username, + url->mraw); + if (status != LXB_STATUS_OK) { + lxb_url_parse_return(orig_data, buf, status); + } + + status = lxb_url_password_copy(&base_url->password, &url->password, + url->mraw); + if (status != LXB_STATUS_OK) { + lxb_url_parse_return(orig_data, buf, status); + } + + status = lxb_url_host_copy(&base_url->host, &url->host, url->mraw); + if (status != LXB_STATUS_OK) { + lxb_url_parse_return(orig_data, buf, status); + } + + if (base_url->has_port) { + lxb_url_port_set(url, base_url->port); + } + + state = LXB_URL_STATE_PATH_STATE; + + goto again; + + case LXB_URL_STATE_SPECIAL_AUTHORITY_SLASHES_STATE: + state = LXB_URL_STATE_SPECIAL_AUTHORITY_IGNORE_SLASHES_STATE; + + if (end - p > 1 && p[0] == '/' && p[1] == '/') { + p += 2; + } + else { + status = lxb_url_log_append(parser, p, + LXB_URL_ERROR_TYPE_SPECIAL_SCHEME_MISSING_FOLLOWING_SOLIDUS); + if (status != LXB_STATUS_OK) { + lxb_url_parse_return(orig_data, buf, status); + } + } + + goto again; + + case LXB_URL_STATE_SPECIAL_AUTHORITY_IGNORE_SLASHES_STATE: + if (p >= end || (*p != '/' && *p != '\\')) { + state = LXB_URL_STATE_AUTHORITY_STATE; + goto again; + } + + status = lxb_url_log_append(parser, p, + LXB_URL_ERROR_TYPE_SPECIAL_SCHEME_MISSING_FOLLOWING_SOLIDUS); + if (status != LXB_STATUS_OK) { + 
lxb_url_parse_return(orig_data, buf, status); + } + + p += 1; + goto again; + + case LXB_URL_STATE_AUTHORITY_STATE: + begin = p; + pswd = NULL; + at_sign = false; + + while (p < end) { + c = *p; + + switch (c) { + case '@': + status = lxb_url_log_append(parser, p, + LXB_URL_ERROR_TYPE_INVALID_CREDENTIALS); + if (status != LXB_STATUS_OK) { + lxb_url_parse_return(orig_data, buf, status); + } + + if (p == begin) { + at_sign = true; + break; + } + + if (pswd == NULL || !at_sign) { + tmp = (pswd != NULL) ? pswd - 1 : p; + + if (tmp > begin) { + status = lxb_url_percent_encode_after_utf_8(begin, tmp, + &url->username, url->mraw, + LXB_URL_MAP_USERINFO, false); + if (status != LXB_STATUS_OK) { + lxb_url_parse_return(orig_data, buf, status); + } + } + } + + if (pswd != NULL && p > pswd) { + status = lxb_url_percent_encode_after_utf_8(pswd, p, + &url->password, url->mraw, + LXB_URL_MAP_USERINFO, false); + if (status != LXB_STATUS_OK) { + lxb_url_parse_return(orig_data, buf, status); + } + + pswd = p; + } + + at_sign = true; + begin = p; + + break; + + case ':': + if (pswd == NULL) { + pswd = p + 1; + } + + break; + + case '/': + case '\\': + case '?': + case '#': + if (c == '\\') { + if (!lxb_url_is_special(url)) { + p += 1; + continue; + } + } + + goto authority_done; + + default: + break; + } + + p += 1; + } + + authority_done: + + if (at_sign) { + if (begin == p || begin == p - 1) { + status = lxb_url_log_append(parser, p, + LXB_URL_ERROR_TYPE_INVALID_CREDENTIALS); + if (status != LXB_STATUS_OK) { + lxb_url_parse_return(orig_data, buf, status); + } + + lxb_url_parse_return(orig_data, buf, LXB_STATUS_ERROR_UNEXPECTED_DATA); + } + + /* Skip '@'. 
*/ + begin += 1; + } + + p = begin; + + state = LXB_URL_STATE_HOST_STATE; + goto again; + + case LXB_URL_STATE_HOST_STATE: + case LXB_URL_STATE_HOSTNAME_STATE: + begin = p; + + if (override_state != LXB_URL_STATE__UNDEF + && url->scheme.type == LXB_URL_SCHEMEL_TYPE_FILE) + { + p -= 1; + state = LXB_URL_STATE_FILE_HOST_STATE; + goto again; + } + + opt = !lxb_url_is_special(url); + + for (; p < end; p++) { + c = *p; + + switch (c) { + case '/': + case '?': + case '#': + goto host_done; + + case '\\': + if (!lxb_url_is_special(url)) { + break; + } + + goto host_done; + + case ':': + if (inside_bracket) { + break; + } + + if (p == begin) { + goto failed_host; + } + + if (override_state == LXB_URL_STATE_HOSTNAME_STATE) { + lxb_url_parse_return(orig_data, buf, LXB_STATUS_OK); + } + + status = lxb_url_host_parse(parser, begin, p, &url->host, + url->mraw, opt); + if (status != LXB_STATUS_OK) { + lxb_url_parse_return(orig_data, buf, status); + } + + p += 1; + + state = LXB_URL_STATE_PORT_STATE; + goto again; + + case '[': + inside_bracket = true; + break; + + case ']': + inside_bracket = false; + break; + + case '%': + opt |= LXB_URL_HOST_OPT_DECODE; + break; + + case 'X': + case 'x': + if (p + 4 <= end && (p[1] == 'n' || p[1] == 'N') + && p[2] == '-' && p[3] == '-') + { + opt |= LXB_URL_HOST_OPT_IDNA; + p += 3; + } + + break; + + default: + if (c >= 0x80) { + opt |= LXB_URL_HOST_OPT_IDNA; + } + + break; + } + } + + host_done: + + if (begin == p && lxb_url_is_special(url)) { + goto failed_host; + } + + if (override_state != LXB_URL_STATE__UNDEF && begin == p + && (lxb_url_includes_credentials(url) || url->has_port)) + { + lxb_url_parse_return(orig_data, buf, LXB_STATUS_OK); + } + + status = lxb_url_host_parse(parser, begin, p, &url->host, + url->mraw, opt); + if (status != LXB_STATUS_OK) { + lxb_url_parse_return(orig_data, buf, status); + } + + if (override_state != LXB_URL_STATE__UNDEF) { + lxb_url_parse_return(orig_data, buf, LXB_STATUS_OK); + } + + state = 
LXB_URL_STATE_PATH_START_STATE; + goto again; + + case LXB_URL_STATE_PORT_STATE: + begin = p; + + while (true) { + c = (p < end) ? *p : '\0'; + + if (c >= '0' && c <= '9') { + p += 1; + continue; + } + + if (p >= end || c == '/' || c == '?' || c == '#' + || (lxb_url_is_special(url) && c == '\\') + || override_state != LXB_URL_STATE__UNDEF) + { + if (begin == p) { + if (override_state != LXB_URL_STATE__UNDEF) { + lxb_url_parse_return(orig_data, buf, LXB_STATUS_OK); + } + + state = LXB_URL_STATE_PATH_START_STATE; + goto again; + } + + port = 0; + + while (begin < p) { + port = lexbor_str_res_map_num[*begin++] + port * 10; + + if (port > 65535) { + status = lxb_url_log_append(parser, p, + LXB_URL_ERROR_TYPE_PORT_OUT_OF_RANGE); + if (status != LXB_STATUS_OK) { + lxb_url_parse_return(orig_data, buf, status); + } + + lxb_url_parse_return(orig_data, buf, + LXB_STATUS_ERROR_UNEXPECTED_DATA); + } + } + + if (!lxb_url_is_special(url) + || !lxb_url_scheme_equal_port(url->scheme.type, port)) + { + url->port = port; + url->has_port = true; + } + else { + url->port = 0; + url->has_port = false; + } + + if (override_state != LXB_URL_STATE__UNDEF) { + lxb_url_parse_return(orig_data, buf, LXB_STATUS_OK); + } + + state = LXB_URL_STATE_PATH_START_STATE; + goto again; + } + + status = lxb_url_log_append(parser, p, + LXB_URL_ERROR_TYPE_PORT_INVALID); + if (status != LXB_STATUS_OK) { + lxb_url_parse_return(orig_data, buf, status); + } + + lxb_url_parse_return(orig_data, buf, LXB_STATUS_ERROR_UNEXPECTED_DATA); + } + + break; + + case LXB_URL_STATE_FILE_STATE: + schm = lxb_url_scheme_by_type(LXB_URL_SCHEMEL_TYPE_FILE); + + status = lxb_url_scheme_copy_special(schm, &url->scheme, url->mraw); + if (status != LXB_STATUS_OK) { + lxb_url_parse_return(orig_data, buf, status); + } + + lxb_url_host_set_empty(&url->host, url->mraw); + + c = (p < end) ? 
*p : '\0'; + + if (c == '/' || c == '\\') { + if (c == '\\') { + status = lxb_url_log_append(parser, p, + LXB_URL_ERROR_TYPE_INVALID_REVERSE_SOLIDUS); + if (status != LXB_STATUS_OK) { + lxb_url_parse_return(orig_data, buf, status); + } + } + + p += 1; + + state = LXB_URL_STATE_FILE_SLASH_STATE; + goto again; + } + + if (base_url != NULL + && base_url->scheme.type == LXB_URL_SCHEMEL_TYPE_FILE) + { + status = lxb_url_host_copy(&base_url->host, &url->host, url->mraw); + if (status != LXB_STATUS_OK) { + lxb_url_parse_return(orig_data, buf, status); + } + + status = lxb_url_path_copy(base_url, url); + if (status != LXB_STATUS_OK) { + lxb_url_parse_return(orig_data, buf, status); + } + + status = lxb_url_query_copy(&base_url->query, &url->query, url->mraw); + if (status != LXB_STATUS_OK) { + lxb_url_parse_return(orig_data, buf, status); + } + + if (c == '?') { + p += 1; + + (void) lexbor_str_destroy(&url->query, url->mraw, false); + + state = LXB_URL_STATE_QUERY_STATE; + goto again; + } + + if (c == '#') { + p += 1; + + (void) lexbor_str_destroy(&url->fragment, url->mraw, false); + + state = LXB_URL_STATE_FRAGMENT_STATE; + goto again; + } + + if (p >= end) { + lxb_url_parse_return(orig_data, buf, LXB_STATUS_OK); + } + + (void) lexbor_str_destroy(&url->query, url->mraw, false); + + if (!lxb_url_start_windows_drive_letter(p, end)) { + lxb_url_path_shorten(url); + } + else { + status = lxb_url_log_append(parser, p, + LXB_URL_ERROR_TYPE_FILE_INVALID_WINDOWS_DRIVE_LETTER); + if (status != LXB_STATUS_OK) { + lxb_url_parse_return(orig_data, buf, status); + } + + lxb_url_path_set_null(url); + url->path.opaque = true; + } + } + + state = LXB_URL_STATE_PATH_STATE; + goto again; + + case LXB_URL_STATE_FILE_SLASH_STATE: + c = (p < end) ? 
*p : '\0'; + + if (c == '/' || c == '\\') { + if (c == '\\') { + status = lxb_url_log_append(parser, p, + LXB_URL_ERROR_TYPE_INVALID_REVERSE_SOLIDUS); + if (status != LXB_STATUS_OK) { + lxb_url_parse_return(orig_data, buf, status); + } + } + + p += 1; + + state = LXB_URL_STATE_FILE_HOST_STATE; + goto again; + } + + if (base_url != NULL + && base_url->scheme.type == LXB_URL_SCHEMEL_TYPE_FILE) + { + status = lxb_url_host_copy(&base_url->host, &url->host, url->mraw); + if (status != LXB_STATUS_OK) { + lxb_url_parse_return(orig_data, buf, status); + } + + if (!lxb_url_start_windows_drive_letter(p, end) + && !base_url->path.opaque && base_url->path.length >= 1) + { + tmp = lxb_url_path_part_by_index(base_url, + base_url->path.str.data[0] == '/', &len); + + if (tmp != NULL + && lxb_url_normalized_windows_drive_letter(tmp, tmp + len)) + { + len = (tmp + len) - base_url->path.str.data; + + status = lxb_url_path_append_wo_slash(url, + base_url->path.str.data, len); + if (status != LXB_STATUS_OK) { + lxb_url_parse_return(orig_data, buf, status); + } + } + } + } + + state = LXB_URL_STATE_PATH_STATE; + goto again; + + case LXB_URL_STATE_FILE_HOST_STATE: + begin = p; + opt = !lxb_url_is_special(url); + + while (true) { + if (p < end) { + switch (*p) { + case '/': + case '\\': + case '?': + case '#': + break; + + case '%': + p += 1; + opt |= LXB_URL_HOST_OPT_DECODE; + continue; + + case 'X': + case 'x': + if (p + 4 <= end && (p[1] == 'n' || p[1] == 'N') + && p[2] == '-' && p[3] == '-') + { + opt |= LXB_URL_HOST_OPT_IDNA; + p += 3; + } + + p += 1; + continue; + + default: + if (*p >= 0x80) { + opt |= LXB_URL_HOST_OPT_IDNA; + } + + p += 1; + continue; + } + } + + if (override_state == LXB_URL_STATE__UNDEF && p - begin == 2 + && lxb_url_windows_drive_letter(begin, p)) + { + status = lxb_url_log_append(parser, begin, + LXB_URL_ERROR_TYPE_FILE_INVALID_WINDOWS_DRIVE_LETTER_HOST); + if (status != LXB_STATUS_OK) { + lxb_url_parse_return(orig_data, buf, status); + } + + p = begin; + + 
state = LXB_URL_STATE_PATH_STATE; + goto again; + } + + if (p == begin) { + lxb_url_host_set_empty(&url->host, url->mraw); + + if (override_state != LXB_URL_STATE__UNDEF) { + lxb_url_parse_return(orig_data, buf, LXB_STATUS_OK); + } + + state = LXB_URL_STATE_PATH_START_STATE; + goto again; + } + + status = lxb_url_host_parse(parser, begin, p, &url->host, + url->mraw, opt); + if (status != LXB_STATUS_OK) { + lxb_url_parse_return(orig_data, buf, status); + } + + if (lxb_url_host_eq(&url->host, lh_str.data, lh_str.length)) { + lxb_url_host_set_empty(&url->host, url->mraw); + } + + if (override_state != LXB_URL_STATE__UNDEF) { + lxb_url_parse_return(orig_data, buf, LXB_STATUS_OK); + } + + state = LXB_URL_STATE_PATH_START_STATE; + goto again; + } + + break; + + case LXB_URL_STATE_PATH_START_STATE: + c = (p < end) ? *p : '\0'; + + if (lxb_url_is_special(url)) { + if (c == '\\') { + status = lxb_url_log_append(parser, p, + LXB_URL_ERROR_TYPE_INVALID_REVERSE_SOLIDUS); + if (status != LXB_STATUS_OK) { + lxb_url_parse_return(orig_data, buf, status); + } + + p += 1; + } + else if (c == '/') { + p += 1; + } + + state = LXB_URL_STATE_PATH_STATE; + goto again; + } + + if (override_state == LXB_URL_STATE__UNDEF) { + if (c == '?') { + p += 1; + state = LXB_URL_STATE_QUERY_STATE; + goto again; + } + + if (c == '#') { + p += 1; + state = LXB_URL_STATE_FRAGMENT_STATE; + goto again; + } + } + + if (p < end) { + if (c == '/') { + p += 1; + } + + state = LXB_URL_STATE_PATH_STATE; + goto again; + } + + if (override_state != LXB_URL_STATE__UNDEF + && url->host.type == LXB_URL_HOST_TYPE__UNDEF) + { + status = lxb_url_path_append(url, mp_str.data, mp_str.length); + lxb_url_parse_return(orig_data, buf, status); + } + + lxb_url_parse_return(orig_data, buf, LXB_STATUS_OK); + + case LXB_URL_STATE_PATH_STATE: + p = lxb_url_path_fast_path(parser, url, p, end, + override_state == LXB_URL_STATE__UNDEF); + if (p == NULL) { + lxb_url_parse_return(orig_data, buf, LXB_STATUS_ERROR_MEMORY_ALLOCATION); + 
} + + if (p >= end) { + lxb_url_parse_return(orig_data, buf, LXB_STATUS_OK); + } + + switch (*p) { + case '?': + p += 1; + state = LXB_URL_STATE_QUERY_STATE; + break; + case '#': + p += 1; + state = LXB_URL_STATE_FRAGMENT_STATE; + break; + } + + goto again; + + case LXB_URL_STATE_OPAQUE_PATH_STATE: + begin = p; + url->path.opaque = true; + + while (true) { + if (p >= end) { + tmp_str.data = NULL; + + status = lxb_url_percent_encode_after_utf_8(begin, p, + &tmp_str, url->mraw, + LXB_URL_MAP_C0, false); + if (status != LXB_STATUS_OK) { + lxb_url_parse_return(orig_data, buf, status); + } + + status = lxb_url_path_list_push(url, &tmp_str); + + lxb_url_parse_return(orig_data, buf, status); + } + + c = *p; + + if (c == '#' || c == '?') { + tmp_str.data = NULL; + + status = lxb_url_percent_encode_after_utf_8(begin, p, + &tmp_str, url->mraw, + LXB_URL_MAP_C0, false); + if (status != LXB_STATUS_OK) { + lxb_url_parse_return(orig_data, buf, status); + } + + status = lxb_url_path_list_push(url, &tmp_str); + if (status != LXB_STATUS_OK) { + lxb_url_parse_return(orig_data, buf, status); + } + + p += 1; + + if (c == '#') { + state = LXB_URL_STATE_FRAGMENT_STATE; + } + else { + state = LXB_URL_STATE_QUERY_STATE; + } + + goto again; + } + + tmp = p; + cp = lxb_url_decode_valid_utf_8_single(&p, end); + + if ((!lxb_url_is_url_codepoint(cp) && cp != '%') + || (cp == '%' && (end - p < 2 + || lexbor_str_res_map_hex[p[0]] == 0xff + || lexbor_str_res_map_hex[p[1]] == 0xff))) + { + status = lxb_url_log_append(parser, tmp, + LXB_URL_ERROR_TYPE_INVALID_URL_UNIT); + if (status != LXB_STATUS_OK) { + lxb_url_parse_return(orig_data, buf, status); + } + } + } + + break; + + case LXB_URL_STATE_QUERY_STATE: + begin = p; + + while (true) { + c = (p < end) ? 
*p : '\0'; + + if (p >= end || (override_state == LXB_URL_STATE__UNDEF && *p == '#')) { + if (lxb_url_is_special(url)) { + map_type = LXB_URL_MAP_SPECIAL_QUERY; + } + else { + map_type = LXB_URL_MAP_QUERY; + } + + status = lxb_url_percent_encode_after_utf_8(begin, p, + &url->query, + url->mraw, + map_type, false); + if (status != LXB_STATUS_OK) { + lxb_url_parse_return(orig_data, buf, status); + } + + if (p < end) { + p += 1; + state = LXB_URL_STATE_FRAGMENT_STATE; + goto again; + } + + lxb_url_parse_return(orig_data, buf, LXB_STATUS_OK); + } + + tmp = p; + cp = lxb_url_decode_valid_utf_8_single(&p, end); + + if ((!lxb_url_is_url_codepoint(cp) && cp != '%') + || (cp == '%' && (end - p < 2 + || lexbor_str_res_map_hex[p[0]] == 0xff + || lexbor_str_res_map_hex[p[1]] == 0xff))) + { + status = lxb_url_log_append(parser, tmp, + LXB_URL_ERROR_TYPE_INVALID_URL_UNIT); + if (status != LXB_STATUS_OK) { + lxb_url_parse_return(orig_data, buf, status); + } + } + } + + break; + + case LXB_URL_STATE_FRAGMENT_STATE: + begin = p; + + while (p < end) { + tmp = p; + cp = lxb_url_decode_valid_utf_8_single(&p, end); + + if ((!lxb_url_is_url_codepoint(cp) && cp != '%') + || (cp == '%' && (end - p < 2 + || lexbor_str_res_map_hex[p[0]] == 0xff + || lexbor_str_res_map_hex[p[1]] == 0xff))) + { + status = lxb_url_log_append(parser, tmp, + LXB_URL_ERROR_TYPE_INVALID_URL_UNIT); + if (status != LXB_STATUS_OK) { + lxb_url_parse_return(orig_data, buf, status); + } + } + } + + status = lxb_url_percent_encode_after_utf_8(begin, p, &url->fragment, + url->mraw, + LXB_URL_MAP_FRAGMENT, false); + lxb_url_parse_return(orig_data, buf, status); + + default: + lxb_url_parse_return(orig_data, buf, LXB_STATUS_ERROR); + } + + lxb_url_parse_return(orig_data, buf, LXB_STATUS_OK); + +failed_non_relative_url: + + status = lxb_url_log_append(parser, p, + LXB_URL_ERROR_TYPE_MISSING_SCHEME_NON_RELATIVE_URL); + if (status != LXB_STATUS_OK) { + lxb_url_parse_return(orig_data, buf, status); + } + + 
lxb_url_parse_return(orig_data, buf, LXB_STATUS_ERROR_UNEXPECTED_DATA); + +failed_host: + + status = lxb_url_log_append(parser, p, + LXB_URL_ERROR_TYPE_HOST_MISSING); + if (status != LXB_STATUS_OK) { + lxb_url_parse_return(orig_data, buf, status); + } + + lxb_url_parse_return(orig_data, buf, LXB_STATUS_ERROR_UNEXPECTED_DATA); +} + +static const lxb_char_t * +lxb_url_path_fast_path(lxb_url_parser_t *parser, lxb_url_t *url, + const lxb_char_t *p, const lxb_char_t *end, bool bqs) +{ + size_t count; + lxb_char_t x, c; + lxb_status_t status; + const lxb_char_t *begin, *last; + + begin = p; + last = p; + count = url->path.length; + + for (; p < end; p++) { + c = *p; + x = lxb_url_path_map[c]; + + if (x != 0x00) { + if (c == '/') { + count += 1; + last = p + 1; + } + else if (c == '%') { + if (end - p < 3 + || lexbor_str_res_map_hex[p[1]] == 0xff + || lexbor_str_res_map_hex[p[2]] == 0xff) + { + status = lxb_url_log_append(parser, p, + LXB_URL_ERROR_TYPE_INVALID_URL_UNIT); + if (status != LXB_STATUS_OK) { + return NULL; + } + + p = (end - p < 3) ? end - 1 : p + 2; + } + else if (p[1] == '2' && (p[2] == 'e' || p[2] == 'E') + && (p == begin + || p[-1] == '/' + || (p[-1] == '\\' && lxb_url_is_special(url)))) + { + url->path.length = count; + + status = lxb_url_path_try_dot(url, &begin, &last, + &p, end, bqs); + if (status != LXB_STATUS_OK) { + return NULL; + } + + count = url->path.length; + } + else { + p += 2; + } + } + else if ((c == '?' 
|| c == '#') && bqs) { + break; + } + else if (c == '\\' && lxb_url_is_special(url)) { + count += 1; + + status = lxb_url_log_append(parser, p, + LXB_URL_ERROR_TYPE_INVALID_REVERSE_SOLIDUS); + if (status != LXB_STATUS_OK) { + return NULL; + } + + status = lxb_url_path_append(url, begin, p - begin); + if (status != LXB_STATUS_OK) { + return NULL; + } + + begin = p + 1; + last = p + 1; + } + else if (c == '.') { + if (p == begin + || p[-1] == '/' + || (p[-1] == '\\' && lxb_url_is_special(url))) + { + url->path.length = count; + + status = lxb_url_path_try_dot(url, &begin, &last, + &p, end, bqs); + if (status != LXB_STATUS_OK) { + return NULL; + } + + count = url->path.length; + } + } + else { + url->path.length = count; + + if (last - 1 > begin) { + status = lxb_url_path_append(url, begin, + (last - 1) - begin); + if (status != LXB_STATUS_OK) { + return NULL; + } + } + + return lxb_url_path_slow_path(parser, url, last, end, bqs); + } + } + } + + status = lxb_url_path_append(url, begin, p - begin); + if (status != LXB_STATUS_OK) { + return NULL; + } + + if (count == 0 || p != begin) { + count += 1; + } + + url->path.length = count; + + return p; +} + +/* + * The lxb_url_path_slow_path() function should not be static. Otherwise, the + * compiler will inline it, which will lead to cache problems and slower code + * execution. 
+ */ +const lxb_char_t * +lxb_url_path_slow_path(lxb_url_parser_t *parser, lxb_url_t *url, + const lxb_char_t *data, const lxb_char_t *end, bool bqs) +{ + size_t len, count; + lxb_char_t c, *last, *sbuf, *sbuf_begin; + lexbor_str_t *str; + lxb_status_t status; + lxb_codepoint_t cp; + const lxb_char_t *p, *tmp, *sbuf_end; + lxb_char_t sbuffer[1024]; + + p = data; + sbuf = sbuffer; + sbuf_begin = sbuffer; + sbuf_end = sbuffer + sizeof(sbuffer); + + c = '\0'; + + if (url->path.str.length != 0) { + str = &url->path.str; + + /* "+ 2" == "/\0" */ + if (sbuf + (str->length + 2) > sbuf_end) { + len = str->length + sizeof(sbuffer); + + sbuf_begin = lexbor_malloc(len); + if (sbuf_begin == NULL) { + return NULL; + } + + sbuf = sbuf_begin; + sbuf_end = sbuf + len; + } + + memcpy(sbuf, str->data, str->length); + sbuf += str->length; + + str->length = 0; + } + + *sbuf++ = '/'; + + last = sbuf; + count = url->path.length; + + while (p < end) { + c = *p; + + if (c >= 0x80) { + tmp = p; + cp = lxb_url_decode_valid_utf_8_single(&p, end); + + if (!lxb_url_is_url_codepoint(cp)) { + status = lxb_url_log_append(parser, tmp, + LXB_URL_ERROR_TYPE_INVALID_URL_UNIT); + if (status != LXB_STATUS_OK) { + goto failed; + } + } + + len = (p - tmp) * 3; + + if (sbuf + len + 1 >= sbuf_end) { + LXB_URL_SBUF_REALLOC(sbuf, sbuf_begin, sbuf_end, sbuffer, last); + } + + while (tmp < p) { + c = *tmp++; + + *sbuf++ = '%'; + *sbuf++ = lexbor_str_res_char_to_two_hex_value[c][0]; + *sbuf++ = lexbor_str_res_char_to_two_hex_value[c][1]; + } + + continue; + } + + if (c == '/') { + *sbuf++ = '/'; + if (sbuf >= sbuf_end) { + LXB_URL_SBUF_REALLOC(sbuf, sbuf_begin, sbuf_end, sbuffer, last); + } + + lxb_url_path_fix_windows_drive(url, last, sbuf, count); + + count += 1; + last = sbuf; + + if (p + 1 >= end) { + count += 1; + } + } + else if (c == '\\' && lxb_url_is_special(url)) { + status = lxb_url_log_append(parser, p, + LXB_URL_ERROR_TYPE_INVALID_REVERSE_SOLIDUS); + if (status != LXB_STATUS_OK) { + goto failed; + 
} + + *sbuf++ = '/'; + if (sbuf >= sbuf_end) { + LXB_URL_SBUF_REALLOC(sbuf, sbuf_begin, sbuf_end, sbuffer, last); + } + + lxb_url_path_fix_windows_drive(url, last, sbuf, count); + + count += 1; + last = sbuf; + + if (p + 1 >= end) { + count += 1; + } + } + else if ((c == '?' || c == '#') && bqs) { + lxb_url_path_fix_windows_drive(url, last, sbuf, count); + + count += 1; + last = sbuf; + break; + } + else if (lxb_url_map[c] & LXB_URL_MAP_PATH) { + if (sbuf + 4 >= sbuf_end) { + LXB_URL_SBUF_REALLOC(sbuf, sbuf_begin, sbuf_end, sbuffer, last); + } + + *sbuf++ = '%'; + *sbuf++ = lexbor_str_res_char_to_two_hex_value[c][0]; + *sbuf++ = lexbor_str_res_char_to_two_hex_value[c][1]; + } + else if (c == '.') { + if (last == sbuf) { + tmp = lxb_url_path_dot_count(url, p, end, sbuf_begin, + &sbuf, &last, &count, bqs); + + if (tmp != p) { + p = tmp + 1; + continue; + } + } + + *sbuf++ = '.'; + if (sbuf >= sbuf_end) { + LXB_URL_SBUF_REALLOC(sbuf, sbuf_begin, + sbuf_end, sbuffer, last); + } + } + else if (c == '%') { + if (end - p < 3 + || lexbor_str_res_map_hex[p[1]] == 0xff + || lexbor_str_res_map_hex[p[2]] == 0xff) + { + status = lxb_url_log_append(parser, p, + LXB_URL_ERROR_TYPE_INVALID_URL_UNIT); + if (status != LXB_STATUS_OK) { + goto failed; + } + } + else if (p[1] == '2' && (p[2] == 'e' || p[2] == 'E') + && last == sbuf) + { + tmp = lxb_url_path_dot_count(url, p, end, sbuf_begin, + &sbuf, &last, &count, bqs); + + if (tmp != p) { + p = tmp + 1; + continue; + } + } + + *sbuf++ = '%'; + if (sbuf >= sbuf_end) { + LXB_URL_SBUF_REALLOC(sbuf, sbuf_begin, + sbuf_end, sbuffer, last); + } + } + else { + if (lxb_url_codepoint_alphanumeric[c] == 0xFF) { + status = lxb_url_log_append(parser, p, + LXB_URL_ERROR_TYPE_INVALID_URL_UNIT); + if (status != LXB_STATUS_OK) { + goto failed; + } + } + + *sbuf++ = c; + if (sbuf >= sbuf_end) { + LXB_URL_SBUF_REALLOC(sbuf, sbuf_begin, sbuf_end, sbuffer, last); + } + } + + p += 1; + } + + if (count == 0 || last < sbuf) { + 
lxb_url_path_fix_windows_drive(url, last, sbuf, count); + count += 1; + } + + url->path.length = count; + + status = lxb_url_path_append_wo_slash(url, sbuf_begin, sbuf - sbuf_begin); + if (status != LXB_STATUS_OK) { + goto failed; + } + + if (sbuf_begin != sbuffer) { + lexbor_free(sbuf_begin); + } + + return p; + +failed: + + if (sbuf_begin != sbuffer) { + lexbor_free(sbuf_begin); + } + + return NULL; +} +
/*
 * Handle a possible single-dot or double-dot path segment starting at *start.
 *
 * The segment is scanned up to the next separator ('/', '\\' for special
 * URLs, or '?'/'#' when bqs is true) or up to end. Every '.' and every
 * "%2e"/"%2E" counts as one dot. If the segment contains anything else, or
 * the dot count is not 1 or 2, nothing is modified and LXB_STATUS_OK is
 * returned so the caller keeps treating the bytes as ordinary path data.
 *
 * Otherwise the pending bytes [*begin, *start - 1) are appended to the URL
 * path, *start / *begin / *last are moved past the dot segment, and the path
 * is shortened (two dots) or has its trailing '/' removed (one dot).
 *
 * Returns LXB_STATUS_OK, or the status of a failed lxb_url_path_append().
 */
static lxb_status_t
lxb_url_path_try_dot(lxb_url_t *url, const lxb_char_t **begin,
                     const lxb_char_t **last, const lxb_char_t **start,
                     const lxb_char_t *end, bool bqs)
{
    unsigned dots = 0;
    lxb_char_t ch;
    lxb_status_t rc;
    lexbor_str_t *path;
    const lxb_char_t *cur = *start;

    while (cur < end) {
        ch = *cur;

        if (ch == '.') {
            dots += 1;
            cur += 1;
            continue;
        }

        if (ch == '%') {
            /* Only a complete "%2e" / "%2E" triplet counts as a dot. */
            if (end - cur < 3 || cur[1] != '2'
                || (cur[2] != 'e' && cur[2] != 'E'))
            {
                return LXB_STATUS_OK;
            }

            dots += 1;
            cur += 3;
            continue;
        }

        if (ch == '/'
            || (ch == '\\' && lxb_url_is_special(url))
            || ((ch == '?' || ch == '#') && bqs))
        {
            /* End of the segment. */
            break;
        }

        /* Any other byte: this is not a pure dot segment. */
        return LXB_STATUS_OK;
    }

    if (dots != 1 && dots != 2) {
        return LXB_STATUS_OK;
    }

    /* Flush what precedes the dot segment, without its separator. */
    if (*begin < *start) {
        rc = lxb_url_path_append(url, *begin, (*start - *begin) - 1);
        if (rc != LXB_STATUS_OK) {
            return rc;
        }
    }

    if (cur >= end) {
        /* The caller's loop increments *start, which then lands on end. */
        *start = end - 1;
        *begin = end;
        *last = end;
    }
    else {
        *start = cur;
        *begin = cur + 1;
        *last = *begin;
    }

    if (dots == 2) {
        lxb_url_path_shorten(url);
        return LXB_STATUS_OK;
    }

    /* Single dot: drop a trailing '/' from the path collected so far. */
    path = &url->path.str;

    if (path->length > 0 && path->data[path->length - 1] == '/') {
        path->length -= 1;
        path->data[path->length] = '\0';
    }

    return LXB_STATUS_OK;
}
+ +static const lxb_char_t * +lxb_url_path_dot_count(lxb_url_t *url, const lxb_char_t *p, + const lxb_char_t *end, const lxb_char_t *sbuf_begin, + lxb_char_t **sbuf, lxb_char_t **last, size_t *path_count, + bool bqs) +{ + unsigned count; + lxb_char_t c, *last_p; + const
lxb_char_t *begin; + + count = 0; + begin = p; + + for (; p < end; p++) { + c = *p; + + if (c == '/' + || (c == '\\' && lxb_url_is_special(url)) + || ((c == '?' || c == '#') && bqs)) + { + break; + } + else if (c == '.') { + count += 1; + } + else if (c == '%') { + if (p + 3 <= end && p[1] == '2' && (p[2] == 'e' || p[2] == 'E')) { + count += 1; + } + else { + return begin; + } + + p += 2; + } + else { + return begin; + } + } + + if (count == 0 || count > 2) { + return begin; + } + + if (url->scheme.type == LXB_URL_SCHEMEL_TYPE_FILE + && *path_count == 1 + && lxb_url_normalized_windows_drive_letter(sbuf_begin + 1, *last - 1)) + { + return p; + } + + if (count == 2) { + if (*path_count > 0) { + *path_count -= 1; + last_p = *last - 1; + + while (last_p > sbuf_begin) { + last_p -= 1; + + if (*last_p == '/') { + *sbuf = last_p + 1; + break; + } + } + + *last = *sbuf; + } + } + + return p; +} + +static void +lxb_url_path_fix_windows_drive(lxb_url_t *url, lxb_char_t *sbuf, + const lxb_char_t *last, size_t count) +{ + if (url->scheme.type == LXB_URL_SCHEMEL_TYPE_FILE + && count == 0 + && ((last - sbuf == 3 && (last[-1] == '/')) || last - sbuf == 2) + && lxb_url_windows_drive_letter(sbuf, last)) + { + sbuf[1] = ':'; + } +} + +static lxb_status_t +lxb_url_leading_trailing(lxb_url_parser_t *parser, + const lxb_char_t **data, size_t *length) +{ + lxb_char_t c; + lxb_status_t status; + lxb_codepoint_t cp; + const lxb_char_t *p, *end, *tmp; + + p = *data; + end = p + *length; + + while (p < end) { + c = *p; + + if (c > 0x1F && c < 0x80 && c != 0x20) { + break; + } + else if (c >= 0x80) { + tmp = p; + cp = lxb_url_decode_valid_utf_8_single(&p, end); + if (cp > 0x1F && cp != LXB_URL_DECODE_ERROR) { + p = tmp; + break; + } + + continue; + } + + p += 1; + } + + if (p != *data) { + status = lxb_url_log_append(parser, *data, + LXB_URL_ERROR_TYPE_INVALID_URL_UNIT); + if (status != LXB_STATUS_OK) { + return status; + } + } + + while (end > p) { + tmp = end; + cp = 
lxb_url_decode_valid_utf_8_single_reverse(&end, p); + + if (cp > 0x1F && cp != 0x20 && cp != LXB_URL_DECODE_ERROR) { + end = tmp; + break; + } + } + + if (end != *data + *length) { + status = lxb_url_log_append(parser, end, + LXB_URL_ERROR_TYPE_INVALID_URL_UNIT); + if (status != LXB_STATUS_OK) { + return status; + } + } + + *data = p; + *length = end - p; + + return LXB_STATUS_OK; +} + +static const lxb_char_t * +lxb_url_remove_tab_newline(lxb_url_parser_t *parser, + const lxb_char_t *data, size_t *length) +{ + size_t len; + lxb_char_t c, *buf, *p_buf; + lxb_status_t status; + const lxb_char_t *p, *end; + + p = data; + end = data + *length; + + /* Fast path. */ + + p = lexbor_swar_seek3(p, end, '\n', '\r', '\t'); + + while (p < end) { + c = *p; + + if (c == '\n' || c == '\r' || c == '\t') { + /* Slow path. */ + goto oh_my; + } + + p += 1; + } + + return data; + +oh_my: + + status = lxb_url_log_append(parser, p, + LXB_URL_ERROR_TYPE_INVALID_URL_UNIT); + if (status != LXB_STATUS_OK) { + return NULL; + } + + buf = lexbor_malloc(*length); + if (buf == NULL) { + return NULL; + } + + p_buf = buf; + len = p - data; + p_buf += len; + + memcpy(buf, data, len); + + p += 1; + + while (p < end) { + c = *p; + + if (c == '\n' || c == '\r' || c == '\t') { + p += 1; + continue; + } + + *p_buf++ = c; + p += 1; + } + + *length = p_buf - buf; + + return buf; +} + + +static const lxb_url_scheme_data_t * +lxb_url_scheme_find(const lxb_char_t *data, size_t length) +{ + const lxb_url_scheme_data_t *schm; + + for (size_t i = LXB_URL_SCHEMEL_TYPE__UNKNOWN + 1; + i < lxb_url_scheme_res_length; i++) + { + schm = &lxb_url_scheme_res[i]; + + if (schm->name.length == length) { + if (lexbor_str_data_ncasecmp(schm->name.data, data, length)) { + return schm; + } + } + } + + return &lxb_url_scheme_res[LXB_URL_SCHEMEL_TYPE__UNKNOWN]; +} + +static lxb_status_t +lxb_url_percent_encode_after_utf_8(const lxb_char_t *data, + const lxb_char_t *end, lexbor_str_t *str, + lexbor_mraw_t *mraw, + 
lxb_url_map_type_t enmap, + bool space_as_plus) +{ + size_t length; + lxb_status_t status; + const lxb_char_t *p; + lxb_char_t c, *pd; + + p = data; + length = end - p; + + /* Only valid for UTF-8. */ + + while (p < end) { + if (lxb_url_map[*p++] & enmap) { + length += 2; + } + } + + status = lxb_url_str_init(str, mraw, length + 1); + if (status != LXB_STATUS_OK) { + return status; + } + + p = data; + pd = &str->data[str->length]; + + while (p < end) { + c = *p; + + if (space_as_plus && c == ' ') { + *pd++ = '+'; + } + else if (lxb_url_map[c] & enmap) { + *pd++ = '%'; + *pd++ = lexbor_str_res_char_to_two_hex_value[c][0]; + *pd++ = lexbor_str_res_char_to_two_hex_value[c][1]; + } + else { + *pd++ = c; + } + + p += 1; + } + + *pd = '\0'; + str->length += pd - &str->data[str->length]; + + return LXB_STATUS_OK; +} + +static lxb_status_t +lxb_url_host_parse(lxb_url_parser_t *parser, const lxb_char_t *data, + const lxb_char_t *end, lxb_url_host_t *host, + lexbor_mraw_t *mraw, lxb_url_host_opt_t opt) +{ + char *dest, *tmp; + UIDNA *idna; + UIDNAInfo pInfo; + UErrorCode errcode; + size_t size; + uint32_t ipv4, length; + lxb_char_t c; + lxb_status_t status; + lexbor_str_t *domain; + const lxb_char_t *p; + + if (data < end && *data == '[') { + if (end[-1] != ']') { + status = lxb_url_log_append(parser, &end[-1], + LXB_URL_ERROR_TYPE_IPV6_UNCLOSED); + if (status != LXB_STATUS_OK) { + return status; + } + + return LXB_STATUS_ERROR_UNEXPECTED_DATA; + } + + data += 1; + end -= 1; + + host->type = LXB_URL_HOST_TYPE_IPV6; + + return lxb_url_ipv6_parse(parser, data, end, host->u.ipv6); + } + + if (opt & LXB_URL_HOST_OPT_NOT_SPECIAL) { + return lxb_url_opaque_host_parse(parser, data, end, host, mraw); + } + + domain = &host->u.domain; + + if (opt & LXB_URL_HOST_OPT_DECODE) { + status = lxb_url_percent_decode(data, end, domain, mraw, &opt); + if (status != LXB_STATUS_OK) { + return status; + } + } + else { + status = lxb_url_str_init(domain, mraw, (end - data) + 1); + if (status != 
LXB_STATUS_OK) { + return status; + } + + if (opt & LXB_URL_HOST_OPT_IDNA) { + domain->length = end - data; + + memcpy(domain->data, data, domain->length); + domain->data[domain->length] = '\0'; + } + else { + (void) lexbor_str_append_lowercase(domain, mraw, data, end - data); + } + } + + if (opt & LXB_URL_HOST_OPT_IDNA) { + errcode = U_ZERO_ERROR; + + idna = uidna_openUTS46(UIDNA_CHECK_BIDI|UIDNA_CHECK_CONTEXTJ, &errcode); + if (U_FAILURE(errcode)) { + return LXB_STATUS_ERROR; + } + + size = 4096; + dest = lexbor_malloc(size); + if (dest == NULL) { + goto failed_idna; + } + + again: + + pInfo = (UIDNAInfo) UIDNA_INFO_INITIALIZER; + errcode = U_ZERO_ERROR; + + length = uidna_nameToASCII_UTF8(idna, (const char *) domain->data, + domain->length, dest, size, &pInfo, + &errcode); + if (U_FAILURE(errcode)) { + if (errcode == U_BUFFER_OVERFLOW_ERROR) { + size *= 4; + + tmp = lexbor_realloc(dest, size); + if (tmp == NULL) { + goto failed_idna; + } + + dest = tmp; + goto again; + } + + return LXB_STATUS_ERROR; + } + + lexbor_str_clean(domain); + + p = lexbor_str_append(domain, mraw, (const lxb_char_t *) dest, length); + if (p == NULL) { + goto failed_idna; + } + + lexbor_free(dest); + uidna_close(idna); + } + + if (domain->length == 0) { + return LXB_STATUS_ERROR_UNEXPECTED_DATA; + } + + p = domain->data; + end = p + domain->length; + + while (p < end) { + c = *p++; + + if (c < 128 && lxb_url_map_forbidden_domain_cp[c] != 0xff) { + status = lxb_url_log_append(parser, p - 1, + LXB_URL_ERROR_TYPE_DOMAIN_INVALID_CODE_POINT); + if (status != LXB_STATUS_OK) { + return status; + } + + return LXB_STATUS_ERROR_UNEXPECTED_DATA; + } + } + + if (lxb_url_is_ipv4(parser, domain->data, end)) { + status = lxb_url_ipv4_parse(parser, domain->data, end, &ipv4); + + (void) lexbor_str_destroy(domain, mraw, false); + + if (status != LXB_STATUS_OK) { + return status; + } + + host->u.ipv4 = ipv4; + host->type = LXB_URL_HOST_TYPE_IPV4; + + return status; + } + + host->type = 
LXB_URL_HOST_TYPE_DOMAIN; + + return LXB_STATUS_OK; + +failed_idna: + + if (dest) { + lexbor_free(dest); + } + + uidna_close(idna); + + return LXB_STATUS_ERROR_MEMORY_ALLOCATION; +} + +lxb_inline lxb_status_t +lxb_url_ipv4_append(lxb_url_parser_t *parser, const lxb_char_t *data, + const lxb_char_t *end, uint64_t *ipv, + int *out_of, unsigned i) +{ + lxb_status_t status; + lxb_url_error_type_t type; + + status = lxb_url_ipv4_number_parse(data, end, &ipv[i]); + + if (status != LXB_STATUS_OK) { + if (status == LXB_STATUS_ERROR) { + type = LXB_URL_ERROR_TYPE_IPV4_NON_NUMERIC_PART; + goto failed; + } + + status = lxb_url_log_append(parser, data, + LXB_URL_ERROR_TYPE_IPV4_NON_DECIMAL_PART); + if (status != LXB_STATUS_OK) { + return status; + } + } + + if (ipv[i] > 255) { + status = lxb_url_log_append(parser, data, + LXB_URL_ERROR_TYPE_IPV4_OUT_OF_RANGE_PART); + if (status != LXB_STATUS_OK) { + return status; + } + + if (*out_of != -1) { + *out_of = (int) i; + } + } + + return LXB_STATUS_OK; + +failed: + + status = lxb_url_log_append(parser, data, type); + if (status != LXB_STATUS_OK) { + return status; + } + + return LXB_STATUS_ERROR_UNEXPECTED_DATA; +} + +static lxb_status_t +lxb_url_ipv4_parse(lxb_url_parser_t *parser, const lxb_char_t *data, + const lxb_char_t *end, uint32_t *ipv4) +{ + int out_of; + uint32_t ip; + uint64_t parts[5]; + unsigned i; + lxb_status_t status; + const lxb_char_t *p, *begin; + lxb_url_error_type_t type; + + static const uint64_t st[] = {0, 256, 65536, 16777216, 4294967296, 0}; + + if (data >= end) { + return LXB_STATUS_ERROR; + } + + i = 0; + p = data; + begin = data; + out_of = -1; + + /* Let's make the compiler happy. 
*/ + + parts[0] = 0; + + while (p < end) { + if (*p == '.') { + if (i == 4) { + type = LXB_URL_ERROR_TYPE_IPV4_TOO_MANY_PARTS; + goto failed; + } + + status = lxb_url_ipv4_append(parser, begin, p, parts, &out_of, ++i); + if (status != LXB_STATUS_OK) { + return status; + } + + begin = p + 1; + } + + p += 1; + } + + if (begin < p) { + if (i == 4) { + type = LXB_URL_ERROR_TYPE_IPV4_TOO_MANY_PARTS; + goto failed; + } + + status = lxb_url_ipv4_append(parser, begin, p, parts, &out_of, ++i); + if (status != LXB_STATUS_OK) { + return status; + } + } + else if (p[-1] == '.') { + status = lxb_url_log_append(parser, begin, + LXB_URL_ERROR_TYPE_IPV4_EMPTY_PART); + if (status != LXB_STATUS_OK) { + return status; + } + } + + if (out_of != -1 && out_of != (int) i) { + return LXB_STATUS_ERROR_OVERFLOW; + } + + if (parts[i] >= st[5 - i]) { + return LXB_STATUS_ERROR_OVERFLOW; + } + + ip = (uint32_t) parts[i--]; + + for (unsigned j = 1; j <= i; j++) { + if (parts[j] > 255) { + return LXB_STATUS_ERROR_OVERFLOW; + } + + ip += parts[j] * st[3 - (j - 1)]; + } + + *ipv4 = ip; + + return LXB_STATUS_OK; + +failed: + + status = lxb_url_log_append(parser, begin, type); + if (status != LXB_STATUS_OK) { + return status; + } + + return LXB_STATUS_ERROR_UNEXPECTED_DATA; +} + +static lxb_status_t +lxb_url_ipv4_number_parse(const lxb_char_t *data, + const lxb_char_t *end, uint64_t *num) +{ + uint64_t n; + unsigned r; + lxb_char_t c; + lxb_status_t status; + const lxb_char_t *str_map; + + if (data >= end) { + goto failed; + } + + r = 10; + str_map = lexbor_str_res_map_num; + status = LXB_STATUS_OK; + + if (data + 1 < end) { + if (data[0] == '0') { + if ((data[1] == 'x' || data[1] == 'X')) { + data += 2; + r = 16; + str_map = lexbor_str_res_map_hex; + } + else { + data += 1; + r = 8; + str_map = lxb_url_map_num_8; + } + + if (data >= end) { + *num = 0; + return LXB_STATUS_WARNING; + } + + status = LXB_STATUS_WARNING; + } + } + + n = 0; + + while (data < end) { + c = *data++; + + if (str_map[c] == 
0xff) { + goto failed; + } + + n = str_map[c] + n * r; + + if (n > UINT32_MAX) { + break; + } + } + + *num = n; + + return status; + +failed: + + *num = 0; + + return LXB_STATUS_ERROR; +} +
/*
 * Decide whether the host in [data, end) should be parsed as IPv4.
 *
 * Only the last dot-separated label is examined, scanning backwards from
 * end. One trailing '.' is ignored. The label may contain only hex digits
 * and 'x'/'X'; any other byte makes the function return false. A label made
 * purely of decimal digits returns true. Otherwise the label is given to
 * lxb_url_ipv4_number_parse(), and the result is true unless that reports
 * LXB_STATUS_ERROR.
 *
 * `parser` is not used.
 */
static bool
lxb_url_is_ipv4(lxb_url_parser_t *parser, const lxb_char_t *data,
                const lxb_char_t *end)
{
    bool only_digits = true;
    bool may_skip_dot = true;
    uint64_t value;
    lxb_char_t ch;
    const lxb_char_t *cur = end;

    if (data >= end) {
        return false;
    }

    do {
        cur -= 1;
        ch = *cur;

        if (lexbor_str_res_map_hex[ch] == 0xff) {
            if (ch == '.') {
                if (cur != end - 1) {
                    /* Start of the last label found. */
                    cur += 1;
                    break;
                }

                /* A dot in the very last position: allowed only once. */
                if (!may_skip_dot) {
                    return false;
                }

                end = cur;
                may_skip_dot = false;
                continue;
            }

            if (ch != 'x' && ch != 'X') {
                return false;
            }
        }

        if (ch < '0' || ch > '9') {
            only_digits = false;
        }
    }
    while (cur > data);

    /* Empty last label. */
    if (cur == end) {
        return false;
    }

    if (only_digits) {
        return true;
    }

    return lxb_url_ipv4_number_parse(cur, end, &value) != LXB_STATUS_ERROR;
}
+ +static lxb_status_t +lxb_url_ipv6_parse(lxb_url_parser_t *parser, const lxb_char_t *data, + const lxb_char_t *end, uint16_t *ipv6) +{ + size_t i, idx; + uint16_t *piece, *compress, num, swap; + lxb_char_t c; + lxb_status_t status; + const lxb_char_t *p; + lxb_url_error_type_t err_type; + + piece = ipv6; + compress = NULL; + p = data; + + if (p >= end) { + goto done; + } + + if (*p == ':') { + if (p + 1 >= end || p[1] != ':') { + p = (p + 1 >= end) ?
p : &p[1]; + + err_type = LXB_URL_ERROR_TYPE_IPV6_INVALID_COMPRESSION; + goto failed; + } + + p += 2; + + piece += 1; + compress = piece; + } + + while (p < end) { + if (piece == &ipv6[8]) { + err_type = LXB_URL_ERROR_TYPE_IPV6_TOO_MANY_PIECES; + goto failed; + } + + if (*p == ':') { + if (compress != NULL) { + err_type = LXB_URL_ERROR_TYPE_IPV6_MULTIPLE_COMPRESSION; + goto failed; + } + + p += 1; + + piece += 1; + compress = piece; + + continue; + } + + num = 0; + i = 0; + + while (i < 4 && p < end) { + c = lexbor_str_res_map_hex[*p]; + if (c == 0xff) { + break; + } + + num = num << 4 | c; + + p += 1; + i += 1; + } + + if (p >= end) { + *piece++ = num; + break; + } + + if (*p == '.') { + if (i == 0) { + err_type = LXB_URL_ERROR_TYPE_IPV4_IN_IPV6_INVALID_CODE_POINT; + goto failed; + } + + p -= i; + + if (piece > &ipv6[6]) { + err_type = LXB_URL_ERROR_TYPE_IPV4_IN_IPV6_TOO_MANY_PIECES; + goto failed; + } + + status = lxb_url_ipv4_in_ipv6_parse(parser, &p, end, &piece); + if (status != LXB_STATUS_OK) { + return status; + } + + break; + } + + if (*p == ':') { + p += 1; + + if (p >= end) { + err_type = LXB_URL_ERROR_TYPE_IPV6_INVALID_CODE_POINT; + goto failed; + } + } + else if (p < end) { + err_type = LXB_URL_ERROR_TYPE_IPV6_INVALID_CODE_POINT; + goto failed; + } + + *piece++ = num; + } + +done: + + if (compress != NULL) { + num = piece - compress; + i = 7; + + while (i != 0 && num > 0) { + idx = (compress - ipv6) + num - 1; + swap = ipv6[idx]; + + ipv6[idx] = ipv6[i]; + ipv6[i] = swap; + + i -= 1; + num -= 1; + } + } + else if (piece - ipv6 != 8) { + err_type = LXB_URL_ERROR_TYPE_IPV6_TOO_FEW_PIECES; + goto failed; + } + + return LXB_STATUS_OK; + +failed: + + status = lxb_url_log_append(parser, p, err_type); + if (status != LXB_STATUS_OK) { + return status; + } + + return LXB_STATUS_ERROR_UNEXPECTED_DATA; +} + +static lxb_status_t +lxb_url_ipv4_in_ipv6_parse(lxb_url_parser_t *parser, const lxb_char_t **data, + const lxb_char_t *end, uint16_t **pieces) +{ + int16_t 
ipv4; + uint16_t *piece; + lxb_char_t c; + lxb_status_t status; + unsigned int seen; + const lxb_char_t *p; + lxb_url_error_type_t err_type; + + piece = *pieces; + seen = 0; + p = *data; + + while (p < end) { + ipv4 = -1; + + if (seen > 0) { + if (*p == '.' && seen < 4) { + p += 1; + + if (p >= end) { + break; + } + } + else { + err_type = LXB_URL_ERROR_TYPE_IPV4_IN_IPV6_INVALID_CODE_POINT; + goto failed; + } + } + + do { + c = *p; + + if (c < '0' || c > '9') { + if (ipv4 == -1) { + err_type = LXB_URL_ERROR_TYPE_IPV4_IN_IPV6_INVALID_CODE_POINT; + goto failed; + } + + break; + } + + if (ipv4 == -1) { + ipv4 = lexbor_str_res_map_num[c]; + } + else if (ipv4 == 0) { + err_type = LXB_URL_ERROR_TYPE_IPV4_IN_IPV6_INVALID_CODE_POINT; + goto failed; + } + else { + ipv4 = ipv4 * 10 + lexbor_str_res_map_num[c]; + } + + if (ipv4 > 255) { + err_type = LXB_URL_ERROR_TYPE_IPV4_IN_IPV6_OUT_OF_RANGE_PART; + goto failed; + } + + p += 1; + } + while (p < end); + + *piece = *piece * 0x100 + ipv4; + + seen += 1; + + if (seen == 2 || seen == 4) { + piece += 1; + } + } + + if (seen != 4) { + err_type = LXB_URL_ERROR_TYPE_IPV4_IN_IPV6_TOO_FEW_PARTS; + goto failed; + } + + *pieces = piece; + *data = p; + + return LXB_STATUS_OK; + +failed: + + status = lxb_url_log_append(parser, p, err_type); + if (status != LXB_STATUS_OK) { + return status; + } + + return LXB_STATUS_ERROR_UNEXPECTED_DATA; +} + +static lxb_status_t +lxb_url_opaque_host_parse(lxb_url_parser_t *parser, const lxb_char_t *data, + const lxb_char_t *end, lxb_url_host_t *host, + lexbor_mraw_t *mraw) +{ + lxb_char_t c; + lxb_status_t status; + const lxb_char_t *p; + + p = data; + + while (p < end) { + c = *p++; + + if (c < 128 && lxb_url_map_forbidden_host_cp[c] != 0xff) { + status = lxb_url_log_append(parser, p - 1, + LXB_URL_ERROR_TYPE_HOST_INVALID_CODE_POINT); + if (status != LXB_STATUS_OK) { + return status; + } + + return LXB_STATUS_ERROR_UNEXPECTED_DATA; + } + } + + host->type = LXB_URL_HOST_TYPE_OPAQUE; + + return 
lxb_url_percent_encode_after_utf_8(data, end, &host->u.opaque, mraw, + LXB_URL_MAP_C0, false); +} + +static lxb_status_t +lxb_url_percent_decode(const lxb_char_t *data, const lxb_char_t *end, + lexbor_str_t *str, lexbor_mraw_t *mraw, + lxb_url_host_opt_t *opt) +{ + lxb_char_t c, *dp; + lxb_status_t status; + const lxb_char_t *p; + + status = lxb_url_str_init(str, mraw, (end - data) + 1); + if (status != LXB_STATUS_OK) { + return status; + } + + p = data; + dp = str->data; + + while (p < end) { + c = *p++; + + if (c != '%') { + *dp++ = c; + continue; + } + + if (p + 2 <= end && lexbor_str_res_map_hex[p[0]] != 0xff + && lexbor_str_res_map_hex[p[1]] != 0xff) + { + c = lexbor_str_res_map_hex[p[0]] << 4 | lexbor_str_res_map_hex[p[1]]; + p += 2; + + if (c >= 0x80) { + *opt |= LXB_URL_HOST_OPT_IDNA; + } + } + + *dp++ = c; + } + + *dp = '\0'; + str->length = dp - str->data; + + return LXB_STATUS_OK; +} + +void +lxb_url_erase(lxb_url_t *url) +{ + if (url == NULL) { + return; + } + + if (url->scheme.name.data != NULL) { + lexbor_str_destroy(&url->scheme.name, url->mraw, false); + } + + switch (url->host.type) { + case LXB_URL_HOST_TYPE_DOMAIN: + case LXB_URL_HOST_TYPE_OPAQUE: + lexbor_str_destroy(&url->host.u.domain, url->mraw, false); + break; + + default: + break; + } + + if (url->username.data != NULL) { + lexbor_str_destroy(&url->username, url->mraw, false); + } + + if (url->password.data != NULL) { + lexbor_str_destroy(&url->password, url->mraw, false); + } + + if (url->path.str.data != NULL) { + lexbor_str_destroy(&url->path.str, url->mraw, false); + } + + if (url->query.data != NULL) { + lexbor_str_destroy(&url->query, url->mraw, false); + } + + if (url->fragment.data != NULL) { + lexbor_str_destroy(&url->fragment, url->mraw, false); + } +} + +lxb_url_t * +lxb_url_destroy(lxb_url_t *url) +{ + if (url == NULL) { + return NULL; + } + + lxb_url_erase(url); + + return lexbor_mraw_free(url->mraw, url); +} + +void +lxb_url_memory_destroy(lxb_url_t *url) +{ + (void) 
lexbor_mraw_destroy(url->mraw, true); +} + +static const lxb_char_t * +lxb_url_path_part_by_index(const lxb_url_t *url, size_t index, + size_t *out_length) +{ + size_t i, length, begin; + const lxb_char_t *data; + + data = url->path.str.data; + length = url->path.str.length; + + i = 0; + begin = 0; + + while (i < length) { + if (data[i] == '/') { + if (index == 0) { + *out_length = i - begin; + return &data[begin]; + } + + index -= 1; + begin = i + 1; + } + + i += 1; + } + + *out_length = 0; + + return NULL; +} + +lxb_status_t +lxb_url_api_href_set(lxb_url_t *url, lxb_url_parser_t *parser, + const lxb_char_t *href, size_t length) +{ + lxb_status_t status; + lexbor_mraw_t *origin_mraw; + lxb_url_parser_t self_parser; + const lxb_char_t tmp[1] = ""; + + if (href == NULL) { + href = tmp; + length = 0; + } + + if (parser == NULL) { + parser = &self_parser; + + parser->log = NULL; + } + + origin_mraw = parser->mraw; + parser->mraw = url->mraw; + + status = lxb_url_parse_basic_h(parser, NULL, NULL, href, length, + LXB_URL_STATE__UNDEF); + + parser->mraw = origin_mraw; + + if (status != LXB_STATUS_OK) { + parser->url = lxb_url_destroy(parser->url); + } + else { + lxb_url_erase(url); + + *url = *parser->url; + } + + if (parser == &self_parser) { + lxb_url_parser_destroy(parser, false); + } + + return status; +} + +lxb_status_t +lxb_url_api_protocol_set(lxb_url_t *url, lxb_url_parser_t *parser, + const lxb_char_t *protocol, size_t length) +{ + lxb_status_t status; + lxb_url_parser_t self_parser; + const lxb_char_t tmp[1] = ""; + + if (protocol == NULL) { + protocol = tmp; + length = 0; + } + + if (parser == NULL) { + parser = &self_parser; + + parser->log = NULL; + } + + status = lxb_url_parse_basic_h(parser, url, NULL, protocol, length, + LXB_URL_STATE_SCHEME_START_STATE); + + if (parser == &self_parser) { + lxb_url_parser_destroy(parser, false); + } + + return status; +} + +lxb_status_t +lxb_url_api_username_set(lxb_url_t *url, + const lxb_char_t *username, size_t length) 
+{ + if (lxb_url_cannot_have_user_pass_port(url)) { + return LXB_STATUS_OK; + } + + url->username.length = 0; + + if (username == NULL || length == 0) { + lexbor_str_destroy(&url->username, url->mraw, false); + return LXB_STATUS_OK; + } + + return lxb_url_percent_encode_after_utf_8(username, username + length, + &url->username, url->mraw, + LXB_URL_MAP_USERINFO, false); +} + +lxb_status_t +lxb_url_api_password_set(lxb_url_t *url, + const lxb_char_t *password, size_t length) +{ + if (lxb_url_cannot_have_user_pass_port(url)) { + return LXB_STATUS_OK; + } + + url->password.length = 0; + + if (password == NULL || length == 0) { + lexbor_str_destroy(&url->password, url->mraw, false); + return LXB_STATUS_OK; + } + + return lxb_url_percent_encode_after_utf_8(password, password + length, + &url->password, url->mraw, + LXB_URL_MAP_USERINFO, false); +} + +lxb_status_t +lxb_url_api_host_set(lxb_url_t *url, lxb_url_parser_t *parser, + const lxb_char_t *host, size_t length) +{ + return lxb_url_host_set_h(url, parser, host, length, + LXB_URL_STATE_HOST_STATE); +} + +lxb_status_t +lxb_url_api_hostname_set(lxb_url_t *url, lxb_url_parser_t *parser, + const lxb_char_t *hostname, size_t length) +{ + return lxb_url_host_set_h(url, parser, hostname, length, + LXB_URL_STATE_HOSTNAME_STATE); +} + +static lxb_status_t +lxb_url_host_set_h(lxb_url_t *url, lxb_url_parser_t *parser, + const lxb_char_t *host, size_t length, + lxb_url_state_t override_state) +{ + lxb_status_t status; + lxb_url_host_t old; + lxb_url_parser_t self_parser; + const lxb_char_t tmp[1] = ""; + + if (url->host.type == LXB_URL_HOST_TYPE_OPAQUE) { + return LXB_STATUS_OK; + } + + if (host == NULL) { + host = tmp; + length = 0; + } + + if (parser == NULL) { + parser = &self_parser; + + parser->log = NULL; + } + + old = url->host; + + memset(&url->host, 0x00, sizeof(lxb_url_host_t)); + + status = lxb_url_parse_basic_h(parser, url, NULL, host, length, + override_state); + + if (parser == &self_parser) { + 
lxb_url_parser_destroy(parser, false); + } + + if (status != LXB_STATUS_OK) { + lxb_url_host_destroy(&url->host, url->mraw); + url->host = old; + } + else { + if (override_state == LXB_URL_STATE_HOSTNAME_STATE + && url->host.type == LXB_URL_HOST_TYPE__UNDEF) + { + url->host = old; + } + else { + lxb_url_host_destroy(&old, url->mraw); + } + } + + return status; +} + +lxb_status_t +lxb_url_api_port_set(lxb_url_t *url, lxb_url_parser_t *parser, + const lxb_char_t *port, size_t length) +{ + lxb_status_t status; + lxb_url_parser_t self_parser; + + if (lxb_url_cannot_have_user_pass_port(url)) { + return LXB_STATUS_OK; + } + + if (port == NULL || length == 0) { + url->port = 0; + url->has_port = false; + + return LXB_STATUS_OK; + } + + if (parser == NULL) { + parser = &self_parser; + + parser->log = NULL; + } + + status = lxb_url_parse_basic_h(parser, url, NULL, port, length, + LXB_URL_STATE_PORT_STATE); + + if (parser == &self_parser) { + lxb_url_parser_destroy(parser, false); + } + + return status; +} + +lxb_status_t +lxb_url_api_pathname_set(lxb_url_t *url, lxb_url_parser_t *parser, + const lxb_char_t *pathname, size_t length) +{ + lxb_status_t status; + lxb_url_parser_t self_parser; + const lxb_char_t tmp[1] = ""; + + if (url->path.opaque) { + return LXB_STATUS_OK; + } + + if (pathname == NULL) { + pathname = tmp; + length = 0; + } + + if (parser == NULL) { + parser = &self_parser; + + parser->log = NULL; + } + + url->path.length = 0; + url->path.str.length = 0; + + status = lxb_url_parse_basic_h(parser, url, NULL, pathname, length, + LXB_URL_STATE_PATH_START_STATE); + + if (parser == &self_parser) { + lxb_url_parser_destroy(parser, false); + } + + return status; +} + +lxb_status_t +lxb_url_api_search_set(lxb_url_t *url, lxb_url_parser_t *parser, + const lxb_char_t *search, size_t length) +{ + lxb_status_t status; + lxb_url_parser_t self_parser; + + lexbor_str_destroy(&url->query, url->mraw, false); + + url->query.length = 0; + + if (search == NULL || length == 0) { + 
lexbor_str_destroy(&url->query, url->mraw, false); + return LXB_STATUS_OK; + } + + if (*search == '?') { + search += 1; + length -= 1; + } + + if (parser == NULL) { + parser = &self_parser; + + parser->log = NULL; + } + + status = lxb_url_parse_basic_h(parser, url, NULL, search, length, + LXB_URL_STATE_QUERY_STATE); + + if (parser == &self_parser) { + lxb_url_parser_destroy(parser, false); + } + + return status; +} + +lxb_status_t +lxb_url_api_hash_set(lxb_url_t *url, lxb_url_parser_t *parser, + const lxb_char_t *hash, size_t length) +{ + lxb_status_t status; + lxb_url_parser_t self_parser; + + lexbor_str_destroy(&url->fragment, url->mraw, false); + + url->fragment.length = 0; + + if (hash == NULL || length == 0) { + lexbor_str_destroy(&url->fragment, url->mraw, false); + return LXB_STATUS_OK; + } + + if (*hash == '#') { + hash += 1; + length -= 1; + } + + if (parser == NULL) { + parser = &self_parser; + + parser->log = NULL; + } + + status = lxb_url_parse_basic_h(parser, url, NULL, hash, length, + LXB_URL_STATE_FRAGMENT_STATE); + + if (parser == &self_parser) { + lxb_url_parser_destroy(parser, false); + } + + return status; +} + +size_t +lxb_url_length(const lxb_url_t *url, bool exclude_fragment) +{ + size_t length; + const lexbor_str_t *str; + + static const lexbor_str_t colon_str = lexbor_str(":"); + static const lexbor_str_t dsol_str = lexbor_str("//"); + static const lexbor_str_t at_str = lexbor_str("@"); + static const lexbor_str_t dt_str = lexbor_str("/."); + static const lexbor_str_t qm_str = lexbor_str("?"); + static const lexbor_str_t hs_str = lexbor_str("#"); + + /* Scheme. */ + + length = lxb_url_scheme_length(url); + length += colon_str.length; + + /* Host. 
*/ + + if (url->host.type != LXB_URL_HOST_TYPE__UNDEF) { + length += dsol_str.length; + + if (lxb_url_includes_credentials(url)) { + length += url->username.length; + + if (url->password.length != 0) { + length += colon_str.length; + length += url->password.length; + } + + length += at_str.length; + } + + length += lxb_url_host_length(&url->host); + + if (url->has_port) { + length += colon_str.length; + length += lxb_url_port_length(url); + } + } + else if (!url->path.opaque && url->path.str.length > 1) { + str = &url->path.str; + + if (str->data[0] == '/' && str->data[1] == '/') { + length += dt_str.length; + } + } + + length += lxb_url_path_length(&url->path); + + if (url->query.data != NULL) { + length += qm_str.length; + length += lxb_url_query_length(url); + } + + if (!exclude_fragment && url->fragment.data != NULL) { + length += hs_str.length; + length += lxb_url_fragment_length(url); + } + + return length; +} + +size_t +lxb_url_scheme_length(const lxb_url_t *url) +{ + return url->scheme.name.length; +} + +size_t +lxb_url_username_length(const lxb_url_t *url) +{ + return url->username.length; +} + +size_t +lxb_url_password_length(const lxb_url_t *url) +{ + return url->password.length; +} + +size_t +lxb_url_host_length(const lxb_url_host_t *host) +{ + switch (host->type) { + case LXB_URL_HOST_TYPE_DOMAIN: + case LXB_URL_HOST_TYPE_OPAQUE: + return host->u.domain.length; + + case LXB_URL_HOST_TYPE_IPV4: + return sizeof("000.000.000.000") - 1; + + case LXB_URL_HOST_TYPE_IPV6: + return sizeof("[FFFF:FFFF:FFFF:FFFF:FFFF:FFFF:FFFF:FFFF]") - 1; + + default: + break; + } + + return 0; +} + +static lxb_status_t +lxb_url_idna_to_unicode(const lexbor_str_t *str, + lexbor_serialize_cb_f cb, void *ctx) +{ + lxb_status_t status; + char *dest, *tmp; + UIDNA *idna; + UIDNAInfo pInfo; + UErrorCode errcode; + size_t size; + uint32_t length; + + errcode = U_ZERO_ERROR; + + idna = uidna_openUTS46(UIDNA_CHECK_BIDI|UIDNA_CHECK_CONTEXTJ, &errcode); + if (U_FAILURE(errcode)) { + 
return LXB_STATUS_ERROR; + } + + size = 4096; + dest = lexbor_malloc(size); + if (dest == NULL) { + goto failed_idna; + } + +again: + + pInfo = (UIDNAInfo) UIDNA_INFO_INITIALIZER; + errcode = U_ZERO_ERROR; + + length = uidna_nameToUnicodeUTF8(idna, (const char *) str->data, str->length, + dest, size, &pInfo, &errcode); + if (U_FAILURE(errcode)) { + if (errcode == U_BUFFER_OVERFLOW_ERROR) { + size *= 4; + + tmp = lexbor_realloc(dest, size); + if (tmp == NULL) { + goto failed_idna; + } + + dest = tmp; + goto again; + } + + return LXB_STATUS_ERROR; + } + + status = cb((const lxb_char_t *) dest, length, ctx); + + lexbor_free(dest); + uidna_close(idna); + + return status; + +failed_idna: + + if (dest != NULL) + lexbor_free(dest); + + uidna_close(idna); + + return LXB_STATUS_OK; +} + +size_t +lxb_url_host_unicode_length(const lxb_url_host_t *host) +{ + size_t length; + lxb_status_t status; + + if (host->type != LXB_URL_HOST_TYPE_DOMAIN + && host->type != LXB_URL_HOST_TYPE_OPAQUE) + { + return lxb_url_host_length(host); + } + + length = 0; + status = lxb_url_idna_to_unicode(&host->u.domain, + lxb_url_callback_length, &length); + + return (status == LXB_STATUS_OK) ? length : 0; +} + +static lxb_status_t +lxb_url_callback_length(const lxb_char_t *data, size_t len, void *ctx) +{ + size_t *length = ctx; + + *length += len; + + return LXB_STATUS_OK; +} + +size_t +lxb_url_port_length(const lxb_url_t *url) +{ + return (url->has_port) ? 
floor(log10(url->port)) + 1 : 0; +} + +size_t +lxb_url_path_length(const lxb_url_path_t *path) +{ + return path->str.length; +} + +size_t +lxb_url_query_length(const lxb_url_t *url) +{ + return url->query.length; +} + +size_t +lxb_url_fragment_length(const lxb_url_t *url) +{ + return url->fragment.length; +} + +lxb_status_t +lxb_url_serialize(const lxb_url_t *url, lexbor_serialize_cb_f cb, void *ctx, + bool exclude_fragment) +{ + lxb_status_t status; + const lexbor_str_t *str; + lxb_char_t *p; + lxb_char_t buf[LXB_URL_BUFFER_NUM_SIZE]; + + static const lexbor_str_t colon_str = lexbor_str(":"); + static const lexbor_str_t dsol_str = lexbor_str("//"); + static const lexbor_str_t at_str = lexbor_str("@"); + static const lexbor_str_t dt_str = lexbor_str("/."); + static const lexbor_str_t qm_str = lexbor_str("?"); + static const lexbor_str_t hs_str = lexbor_str("#"); + + /* Scheme. */ + + str = &url->scheme.name; + + lexbor_serialize_write(cb, str->data, str->length, ctx, status); + lexbor_serialize_write(cb, colon_str.data, colon_str.length, ctx, status); + + /* Host. 
*/ + + if (url->host.type != LXB_URL_HOST_TYPE__UNDEF) { + lexbor_serialize_write(cb, dsol_str.data, dsol_str.length, ctx, status); + + if (lxb_url_includes_credentials(url)) { + lexbor_serialize_write(cb, url->username.data, url->username.length, + ctx, status); + + if (url->password.length != 0) { + lexbor_serialize_write(cb, colon_str.data, colon_str.length, + ctx, status); + lexbor_serialize_write(cb, url->password.data, + url->password.length, ctx, status); + } + + lexbor_serialize_write(cb, at_str.data, at_str.length, ctx, status); + } + + status = lxb_url_serialize_host(&url->host, cb, ctx); + if (status != LXB_STATUS_OK) { + return status; + } + + if (url->has_port) { + lexbor_serialize_write(cb, colon_str.data, colon_str.length, + ctx, status); + + p = buf + lexbor_conv_int64_to_data((int64_t) url->port, + buf, LXB_URL_BUFFER_NUM_SIZE); + + lexbor_serialize_write(cb, buf, p - buf, ctx, status); + } + } + else if (!url->path.opaque && url->path.str.length > 1) { + str = &url->path.str; + + if (str->data[0] == '/' && str->data[1] == '/') { + lexbor_serialize_write(cb, dt_str.data, dt_str.length, ctx, status); + } + } + + status = lxb_url_serialize_path(&url->path, cb, ctx); + if (status != LXB_STATUS_OK) { + return status; + } + + if (url->query.data != NULL) { + lexbor_serialize_write(cb, qm_str.data, qm_str.length, + ctx, status); + lexbor_serialize_write(cb, url->query.data, url->query.length, + ctx, status); + } + + if (!exclude_fragment && url->fragment.data != NULL) { + lexbor_serialize_write(cb, hs_str.data, hs_str.length, + ctx, status); + lexbor_serialize_write(cb, url->fragment.data, url->fragment.length, + ctx, status); + } + + return LXB_STATUS_OK; +} + +lxb_status_t +lxb_url_serialize_scheme(const lxb_url_t *url, + lexbor_serialize_cb_f cb, void *ctx) +{ + const lexbor_str_t *str = &url->scheme.name; + + return cb(str->data, str->length, ctx); +} + +lxb_status_t +lxb_url_serialize_username(const lxb_url_t *url, + lexbor_serialize_cb_f cb, void 
*ctx) +{ + if (lxb_url_includes_credentials(url)) { + return cb(url->username.data, url->username.length, ctx); + } + + return LXB_STATUS_OK; +} + +lxb_status_t +lxb_url_serialize_password(const lxb_url_t *url, + lexbor_serialize_cb_f cb, void *ctx) +{ + if (lxb_url_includes_credentials(url)) { + return cb(url->password.data, url->password.length, ctx); + } + + return LXB_STATUS_OK; +} + +lxb_status_t +lxb_url_serialize_host(const lxb_url_host_t *host, + lexbor_serialize_cb_f cb, void *ctx) +{ + lxb_status_t status; + + static const lexbor_str_t ob_str = lexbor_str("["); + static const lexbor_str_t cb_str = lexbor_str("]"); + + switch (host->type) { + case LXB_URL_HOST_TYPE_DOMAIN: + case LXB_URL_HOST_TYPE_OPAQUE: + return cb(host->u.domain.data, host->u.domain.length, ctx); + + case LXB_URL_HOST_TYPE_IPV4: + return lxb_url_serialize_host_ipv4(host->u.ipv4, cb, ctx); + + case LXB_URL_HOST_TYPE_IPV6: + lexbor_serialize_write(cb, ob_str.data, ob_str.length, + ctx, status); + + status = lxb_url_serialize_host_ipv6(host->u.ipv6, cb, ctx); + if (status != LXB_STATUS_OK) { + return status; + } + + return cb(cb_str.data, cb_str.length, ctx); + + default: + break; + } + + return LXB_STATUS_OK; +} + +lxb_status_t +lxb_url_serialize_host_unicode(const lxb_url_host_t *host, + lexbor_serialize_cb_f cb, void *ctx) +{ + if (host->type != LXB_URL_HOST_TYPE_DOMAIN + && host->type != LXB_URL_HOST_TYPE_OPAQUE) + { + return lxb_url_serialize_host(host, cb, ctx); + } + + return lxb_url_idna_to_unicode(&host->u.domain, cb, ctx); +} + +lxb_status_t +lxb_url_serialize_host_ipv4(uint32_t ipv4, + lexbor_serialize_cb_f cb, void *ctx) +{ + lxb_char_t *p, *end; + lxb_char_t buf[LXB_URL_BUFFER_NUM_SIZE]; + + p = buf; + end = p + LXB_URL_BUFFER_NUM_SIZE; + + p += lexbor_conv_int64_to_data((uint8_t) (ipv4 >> 24), p, end - p); + *p++ = '.'; + p += lexbor_conv_int64_to_data((uint8_t) (ipv4 >> 16), p, end - p); + *p++ = '.'; + p += lexbor_conv_int64_to_data((uint8_t) (ipv4 >> 8), p, end - p); + 
/*
 * Writes an IPv6 address (eight 16-bit pieces) in textual form, without the
 * surrounding brackets, through a single callback call.
 *
 * The first run of two or more consecutive zero pieces that is longer than
 * any earlier run is replaced by "::"; a lone zero piece is written as is.
 *
 * NOTE(review): ':' separators and the final NUL are stored without a bounds
 * check; this presumably relies on LXB_URL_BUFFER_NUM_SIZE being large
 * enough for the longest form (8 * 4 hex digits + 7 colons + NUL) -- confirm.
 *
 * @param[in] ipv6 Array of 8 pieces.
 * @param[in] cb   Callback receiving the text.
 * @param[in] ctx  Opaque pointer forwarded to cb.
 *
 * @return the status returned by cb.
 */
lxb_status_t
lxb_url_serialize_host_ipv6(const uint16_t *ipv6,
                            lexbor_serialize_cb_f cb, void *ctx)
{
    bool ignore;
    size_t i, count, tmp_count;
    const uint16_t *compress, *tmp_compress;
    lxb_char_t *p, *end;
    lxb_char_t buf[LXB_URL_BUFFER_NUM_SIZE];

    p = buf;
    end = p + LXB_URL_BUFFER_NUM_SIZE;

    /*
     * Pass 1: find the longest run of zero pieces.
     * tmp_compress/tmp_count track the run currently being scanned,
     * compress/count remember the best run seen so far.
     */
    count = 0;
    tmp_count = (size_t) (ipv6[0] == 0);
    compress = NULL;
    tmp_compress = ipv6;

    for (i = 1; i < 8; i++) {
        if (ipv6[i] == 0) {
            if (ipv6[i - 1] == 0) {
                tmp_count += 1;
            }
            else {
                /* A new run of zeros starts here. */
                tmp_count = 1;
                tmp_compress = &ipv6[i];
            }
        }
        else if (tmp_count > count) {
            /* Strictly greater: on a tie the earlier run is kept. */
            compress = tmp_compress;
            count = tmp_count;
        }
    }

    /* A run that reaches the last piece is only accounted for here. */
    if (tmp_count > count) {
        compress = tmp_compress;
        count = tmp_count;
    }

    if (compress == &ipv6[1] && ipv6[0] == 0) {
        compress = ipv6;
    }

    /* A single zero piece is not compressed. */
    if (count < 2) {
        compress = NULL;
    }

    /* Pass 2: emit the pieces, collapsing the chosen run. */
    ignore = false;

    for (i = 0; i < 8; i++) {
        if (ignore) {
            if (ipv6[i] == 0) {
                continue;
            }

            ignore = false;
        }

        if (compress == &ipv6[i]) {
            /*
             * The previous piece already wrote one ':'; at the very start
             * both colons of "::" have to be written here.
             */
            *p++ = ':';

            if (i == 0) {
                *p++ = ':';
            }

            /* Jump to the last piece of the run; the loop's i++ leaves it. */
            i += count - 1;
            ignore = true;

            continue;
        }

        p += lexbor_conv_dec_to_hex(ipv6[i], p, end - p);

        if (i != 7) {
            *p++ = ':';
        }
    }

    *p = '\0';

    return cb(buf, p - buf, ctx);
}
+} + +lxb_status_t +lxb_url_serialize_path(const lxb_url_path_t *path, + lexbor_serialize_cb_f cb, void *ctx) +{ + return cb(path->str.data, path->str.length, ctx); +} + +lxb_status_t +lxb_url_serialize_query(const lxb_url_t *url, + lexbor_serialize_cb_f cb, void *ctx) +{ + if (url->query.data != NULL) { + return cb(url->query.data, url->query.length, ctx); + } + + return LXB_STATUS_OK; +} + +lxb_status_t +lxb_url_serialize_fragment(const lxb_url_t *url, + lexbor_serialize_cb_f cb, void *ctx) +{ + if (url->query.data != NULL) { + return cb(url->fragment.data, url->fragment.length, ctx); + } + + return LXB_STATUS_OK; +} + +lxb_url_t * +lxb_url_clone(lexbor_mraw_t *mraw, lxb_url_t *url) +{ + lxb_status_t status; + lxb_url_t *new_url; + + new_url = lexbor_mraw_calloc(mraw, sizeof(lxb_url_t)); + if (new_url == NULL) { + return NULL; + } + + new_url->mraw = mraw; + + status = lxb_url_scheme_copy(&url->scheme, &new_url->scheme, mraw); + if (status != LXB_STATUS_OK) { + goto failed; + } + + status = lxb_url_username_copy(&url->username, &new_url->username, mraw); + if (status != LXB_STATUS_OK) { + goto failed; + } + + status = lxb_url_password_copy(&url->password, &new_url->password, mraw); + if (status != LXB_STATUS_OK) { + goto failed; + } + + status = lxb_url_host_copy(&url->host, &new_url->host, mraw); + if (status != LXB_STATUS_OK) { + goto failed; + } + + new_url->port = url->port; + new_url->has_port = url->has_port; + + status = lxb_url_path_copy(url, new_url); + if (status != LXB_STATUS_OK) { + goto failed; + } + + status = lxb_url_query_copy(&url->query, &new_url->query, mraw); + if (status != LXB_STATUS_OK) { + goto failed; + } + + status = lxb_url_str_copy(&url->fragment, &new_url->fragment, mraw); + if (status != LXB_STATUS_OK) { + goto failed; + } + + return new_url; + +failed: + + lxb_url_destroy(new_url); + + return NULL; +} + +/* + * Code from lexbor/encoding/decode.c + */ +static lxb_codepoint_t +lxb_url_decode_valid_utf_8_single(const lxb_char_t **data, 
/*
 * Decodes the UTF-8 sequence that ends just before *end, walking backwards
 * but never past begin.
 *
 * Bytes that match none of the lead-byte patterns are stepped over until a
 * lead byte (or an ASCII byte) is found.  Trailing bytes are not checked for
 * the 10xxxxxx form; only their low six bits are used (the name suggests the
 * input is expected to be valid UTF-8 already).
 *
 * @param[in,out] end   On entry points one past the last byte of the
 *                      sequence.  On return points to the byte where the
 *                      scan stopped (the first byte of the decoded
 *                      sequence on success).
 * @param[in]     begin Lower bound of the buffer.
 *
 * @return the decoded code point, or LXB_URL_DECODE_ERROR.
 */
static lxb_codepoint_t
lxb_url_decode_valid_utf_8_single_reverse(const lxb_char_t **end,
                                          const lxb_char_t *begin)
{
    lxb_codepoint_t cp;
    const lxb_char_t *p = *end;

    while (p > begin) {
        p -= 1;

        if (*p < 0x80){
            /* 0xxxxxxx */

            cp = (lxb_codepoint_t) *p;

            (*end) = p;
            return cp;
        }
        else if ((*p & 0xe0) == 0xc0) {
            /* 110xxxxx 10xxxxxx */

            /* Lead byte found, but fewer bytes follow it than it needs. */
            if (*end - p < 2) {
                *end = p;
                return LXB_URL_DECODE_ERROR;
            }

            cp = (p[0] ^ (0xC0 & p[0])) << 6;
            cp |= (p[1] ^ (0x80 & p[1]));

            (*end) = p;
            return cp;
        }
        else if ((*p & 0xf0) == 0xe0) {
            /* 1110xxxx 10xxxxxx 10xxxxxx */

            if (*end - p < 3) {
                *end = p;
                return LXB_URL_DECODE_ERROR;
            }

            cp = (p[0] ^ (0xE0 & p[0])) << 12;
            cp |= (p[1] ^ (0x80 & p[1])) << 6;
            cp |= (p[2] ^ (0x80 & p[2]));

            (*end) = p;
            return cp;
        }
        else if ((*p & 0xf8) == 0xf0) {
            /* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */

            if (*end - p < 4) {
                *end = p;
                return LXB_URL_DECODE_ERROR;
            }

            cp = (p[0] ^ (0xF0 & p[0])) << 18;
            cp |= (p[1] ^ (0x80 & p[1])) << 12;
            cp |= (p[2] ^ (0x80 & p[2])) << 6;
            cp |= (p[3] ^ (0x80 & p[3]));

            (*end) = p;
            return cp;
        }
        else if (*end - p >= 4) {
            /* Four bytes walked without meeting a lead byte: give up. */
            break;
        }
    }

    *end = p;

    return LXB_URL_DECODE_ERROR;
}
/*
 * States of the URL parser.
 *
 * A value of this type is also what lxb_url_parse_basic() takes as its
 * override_state argument; LXB_URL_STATE__UNDEF means "no override".
 * The lxb_url_api_*_set() functions use the scheme start, host, hostname,
 * port, path start, query and fragment states for that purpose.
 */
typedef enum {
    LXB_URL_STATE__UNDEF = 0x00,
    LXB_URL_STATE_SCHEME_START_STATE,
    LXB_URL_STATE_SCHEME_STATE,
    LXB_URL_STATE_NO_SCHEME_STATE,
    LXB_URL_STATE_SPECIAL_RELATIVE_OR_AUTHORITY_STATE,
    LXB_URL_STATE_PATH_OR_AUTHORITY_STATE,
    LXB_URL_STATE_RELATIVE_STATE,
    LXB_URL_STATE_RELATIVE_SLASH_STATE,
    LXB_URL_STATE_SPECIAL_AUTHORITY_SLASHES_STATE,
    LXB_URL_STATE_SPECIAL_AUTHORITY_IGNORE_SLASHES_STATE,
    LXB_URL_STATE_AUTHORITY_STATE,
    LXB_URL_STATE_HOST_STATE,
    LXB_URL_STATE_HOSTNAME_STATE,
    LXB_URL_STATE_PORT_STATE,
    LXB_URL_STATE_FILE_STATE,
    LXB_URL_STATE_FILE_SLASH_STATE,
    LXB_URL_STATE_FILE_HOST_STATE,
    LXB_URL_STATE_PATH_START_STATE,
    LXB_URL_STATE_PATH_STATE,
    LXB_URL_STATE_OPAQUE_PATH_STATE,
    LXB_URL_STATE_QUERY_STATE,
    LXB_URL_STATE_FRAGMENT_STATE
}
lxb_url_state_t;
/*
 * A parsed URL.
 *
 * All strings are allocated from the mraw object stored in the structure.
 */
typedef struct {
    lxb_url_scheme_t scheme;

    /* A host of type LXB_URL_HOST_TYPE__UNDEF is serialized without "//". */
    lxb_url_host_t host;

    lexbor_str_t username;
    lexbor_str_t password;

    /* port is meaningful only while has_port is true. */
    uint16_t port;
    bool has_port;

    lxb_url_path_t path;

    /*
     * For both strings a NULL data pointer means "not present"; a non-NULL
     * pointer with zero length is serialized as a bare '?' or '#'.
     */
    lexbor_str_t query;
    lexbor_str_t fragment;

    /* Memory object the URL's strings are allocated from and released to. */
    lexbor_mraw_t *mraw;
}
lxb_url_t;
/*
 * Clears the parser object, returning it to the state it had right after
 * initialization.
 *
 * This function must be called before the parsing functions are used again.
 *
 * For example:
 *     lxb_url_parse()
 *     lxb_url_parser_clean()
 *     lxb_url_parse()
 *     lxb_url_destroy()
 *
 * @param[in] lxb_url_parser_t *
 */
LXB_API void
lxb_url_parser_clean(lxb_url_parser_t *parser);
/*
 * URL parser.
 *
 * This function is an implementation of URL parsing according to the WHATWG
 * specification.
 *
 * @param[in] lxb_url_parser_t *.
 * @param[in] const lxb_url_t *. Base URL, can be NULL.
 * @param[in] Input characters. Not NULL.
 * @param[in] Length of characters. Can be 0.
 *
 * @return lxb_url_t * if successful, otherwise NULL.
 */
LXB_API lxb_url_t *
lxb_url_parse(lxb_url_parser_t *parser, const lxb_url_t *base_url,
              const lxb_char_t *data, size_t length);

/*
 * URL basic parser.
 *
 * This function is an implementation of URL parsing according to the WHATWG
 * specification.
 *
 * Use the lxb_url_get() function to get the URL object.
 *
 * @param[in] lxb_url_parser_t *.
 * @param[in] lxb_url_t *. Can be NULL.
 * @param[in] const lxb_url_t *. Base URL, can be NULL.
 * @param[in] Input characters. Not NULL.
 * @param[in] Length of characters. Can be 0.
 * @param[in] lxb_url_state_t. State override; pass LXB_URL_STATE__UNDEF for
 * the default behavior.
 *
 * @return LXB_STATUS_OK if successful, otherwise an error status value.
 */
LXB_API lxb_status_t
lxb_url_parse_basic(lxb_url_parser_t *parser, lxb_url_t *url,
                    const lxb_url_t *base_url,
                    const lxb_char_t *data, size_t length,
                    lxb_url_state_t override_state);

/*
 * Erase URL.
 *
 * Frees all internal memory occupied by the URL object, but does not destroy
 * the object.
 *
 * @param[in] lxb_url_t *.
 */
LXB_API void
lxb_url_erase(lxb_url_t *url);
/*
 * Search (query) setter.
 *
 * The current query is released first.  If the new value is NULL or has
 * zero length the URL is left without a query.  Otherwise one leading '?'
 * is skipped and the rest is parsed as the new query.
 *
 * @param[in] lxb_url_t *.
 * @param[in] lxb_url_parser_t *. Can be NULL.
 * @param[in] Query characters, with or without the leading '?'. Can be NULL.
 * @param[in] Length of characters. Can be 0.
 *
 * @return LXB_STATUS_OK if successful, otherwise an error status value.
 */
LXB_API lxb_status_t
lxb_url_api_search_set(lxb_url_t *url, lxb_url_parser_t *parser,
                       const lxb_char_t *search, size_t length);
/*
 * Length of the host in its ASCII form.
 *
 * For domain and opaque hosts this is the exact stored length.  For IPv4 and
 * IPv6 hosts it is NOT the length of the serialized address but a fixed
 * upper bound: the length of "000.000.000.000" and of
 * "[FFFF:FFFF:FFFF:FFFF:FFFF:FFFF:FFFF:FFFF]" respectively.  Consequently
 * lxb_url_length() is also an upper bound for URLs with such hosts.
 * Any other host type yields 0.
 *
 * @param[in] const lxb_url_host_t *.
 *
 * @return length in bytes.
 */
LXB_API size_t
lxb_url_host_length(const lxb_url_host_t *host);
/*
 * Returns the stored port number.
 *
 * The value is meaningful only when lxb_url_has_port() returns true; a URL
 * whose port was removed with lxb_url_api_port_set() stores 0 here.
 */
lxb_inline uint16_t
lxb_url_port(const lxb_url_t *url)
{
    return url->port;
}
+lxb_url_get(lxb_url_parser_t *parser) +{ + return parser->url; +} + + +#ifdef __cplusplus +} /* extern "C" */ +#endif + +#endif /* LEXBOR_URL_H */ diff --git a/contrib/url/meson.build b/contrib/url/meson.build new file mode 100644 index 0000000000..ad1d1e6d24 --- /dev/null +++ b/contrib/url/meson.build @@ -0,0 +1,52 @@ + +if not icu.found() + subdir_done() +endif + +url_sources = files( + 'url.c', + 'lexbor/core/array.c', + 'lexbor/core/array_obj.c', + 'lexbor/core/bst.c', + 'lexbor/core/conv.c', + 'lexbor/core/diyfp.c', + 'lexbor/core/dobject.c', + 'lexbor/core/dtoa.c', + 'lexbor/core/mem.c', + 'lexbor/core/memory.c', + 'lexbor/core/mraw.c', + 'lexbor/core/plog.c', + 'lexbor/core/str.c', + 'lexbor/core/strtod.c', + 'lexbor/url/url.c', +) + +if host_system == 'windows' + ltree_sources += rc_lib_gen.process(win32ver_rc, extra_args: [ + '--NAME', 'url', + '--FILEDESC', 'url - Uniform Resource Locator data type',]) +endif + +url = shared_module('url', + url_sources, + include_directories: include_directories('.'), + kwargs: contrib_mod_args, +) +contrib_targets += url + +install_data( + 'url.control', + 'url--1.0.sql', + kwargs: contrib_data_args, +) + +tests += { + 'name': 'url', + 'sd': meson.current_source_dir(), + 'bd': meson.current_build_dir(), + 'regress': { + 'sql': [ + 'url' + ], + }, +} diff --git a/contrib/url/sql/url.sql b/contrib/url/sql/url.sql new file mode 100644 index 0000000000..fe5e8d04e2 --- /dev/null +++ b/contrib/url/sql/url.sql @@ -0,0 +1,126 @@ +-- +-- Basic URL tests for the behavior of functions. +-- The tests for compliance with the specification are located separately. +-- + +-- The tests are designed for a UTF-8 database. Skip otherwise. 
+SELECT getdatabaseencoding() NOT IN ('UTF8') + AS skip_test \gset +\if :skip_test + \quit +\endif + +SELECT getdatabaseencoding(); -- label the results files + +CREATE EXTENSION url; + +-- Getters +select ('https://root:qwerty@example.com:8080/path/to/home?abc=xyz#anchor'::url).scheme; -- OK, https +select ('https://root:qwerty@example.com:8080/path/to/home?abc=xyz#anchor'::url).username; -- OK, root +select ('https://root:qwerty@example.com:8080/path/to/home?abc=xyz#anchor'::url).password; -- OK, qwerty +select ('https://root:qwerty@example.com:8080/path/to/home?abc=xyz#anchor'::url).host; -- OK, example.com +select ('https://root:qwerty@εxαmπle.cθm:8080/path/to/home?abc=xyz#anchor'::url).host; -- OK, xn--xmle-0ldw4f.xn--cm-x9b +select ('https://root:qwerty@εxαmπle.cθm:8080/path/to/home?abc=xyz#anchor'::url).host_unicode; -- OK, εxαmπle.cθm +select ('https://root:qwerty@example.com:8080/path/to/home?abc=xyz#anchor'::url).port; -- OK, 8080 +select ('https://root:qwerty@example.com:8080/path/to/home?abc=xyz#anchor'::url).path; -- OK, /path/to/home +select ('https://root:qwerty@example.com:8080/path/to/home?abc=xyz#anchor'::url).query; -- OK, abc=xyz +select ('https://root:qwerty@example.com:8080/path/to/home?abc=xyz#anchor'::url).fragment; -- OK, anchor + +-- Setters + +select url_scheme_set('https://root:qwerty@example.com:8080/path/to/home?abc=xyz#anchor'::url, 'wss'); -- OK +select url_username_set('https://root:qwerty@example.com:8080/path/to/home?abc=xyz#anchor'::url, 'guest'); -- OK +select url_password_set('https://root:qwerty@example.com:8080/path/to/home?abc=xyz#anchor'::url, '12345'); -- OK +select url_host_set('https://root:qwerty@example.com:8080/path/to/home?abc=xyz#anchor'::url, 'postgresql.org'); -- OK +select url_port_set('https://root:qwerty@example.com:8080/path/to/home?abc=xyz#anchor'::url, '80'); -- OK +select url_path_set('https://root:qwerty@example.com:8080/path/to/home?abc=xyz#anchor'::url, '/docs/books/'); -- OK +select 
url_query_set('https://root:qwerty@example.com:8080/path/to/home?abc=xyz#anchor'::url, 'xyz=abc'); -- OK +select url_fragment_set('https://root:qwerty@example.com:8080/path/to/home?abc=xyz#anchor'::url, 'general_questions'); -- OK + +-- Base +select url_base(NULL::url, NULL); -- OK, NULL +select url_base('https://root:qwerty@example.com:8080/path/to/home?abc=xyz#anchor'::url, NULL); -- OK +select url_base('https://root:qwerty@example.com:8080/path/to/home?abc=xyz#anchor'::url, '/change/path'); -- OK + +-- Unicode +select ('https://εxαmπle.cθm/'::url).host; -- OK, xn--xmle-0ldw4f.xn--cm-x9b +select ('https://εxαmπle.cθm/'::url).host_unicode; -- OK, εxαmπle.cθm + +-- Percent Encode +select ('https://βεst@example.com'::url).username; -- OK +select ('https://:παssφord@example.com'::url).password; -- OK +select ('https://example.com/pαth/to/hθmε'::url).path; -- OK +select ('https://xample.com/?αβγ=χψω'::url).query; -- OK +select ('https://xample.com/#αnchθrΩ'::url).fragment; -- OK + +-- Getters Ok, Error +select ''::url; -- ERROR +select NULL::url; -- OK + +select (NULL::url).scheme; -- OK +select ('file://path/to'::url).scheme; -- OK +select ('/bad/url'::url).scheme; -- ERROR + +select (NULL::url).username; -- OK +select ('https://example.com'::url).username; -- OK + +select (NULL::url).password; -- OK +select ('https://example.com'::url).password; -- OK + +select (NULL::url).host; -- OK +select ('file://host/to'::url).host; -- OK +select ('file:/path/to'::url).host; -- OK +select (NULL::url).host_unicode; -- OK +select ('file://host/to'::url).host_unicode; -- OK +select ('file:/path/to'::url).host_unicode; -- OK + +select (NULL::url).port; -- OK +select ('https://example.com'::url).port; -- OK + +select (NULL::url).path; -- OK +select ('https://example.com'::url).path; -- OK +select ('file:/path/to'::url).path; -- OK + +select (NULL::url).query; -- OK +select ('https://example.com'::url).query; -- OK + +select (NULL::url).fragment; -- OK +select 
('https://example.com'::url).fragment; -- OK + +-- Setters Ok, Error +select url_scheme_set('https://example.com'::url, NULL); -- ERROR +select url_scheme_set('https://example.com'::url, ''); -- ERROR +select url_scheme_set('https://example.com'::url, '---+'); -- ERROR + +select url_username_set('https://root:qwerty@example.com'::url, NULL); -- OK +select url_username_set('https://root:qwerty@example.com'::url, ''); -- OK +select url_username_set('https://root:qwerty@example.com'::url, 'αβγ'); -- OK + +select url_password_set('https://root:qwerty@example.com'::url, NULL); -- OK +select url_password_set('https://root:qwerty@example.com'::url, ''); -- OK +select url_password_set('https://root:qwerty@example.com'::url, 'αβγ'); -- OK + +select url_port_set('https://example.com:8080'::url, NULL); -- OK +select url_port_set('https://example.com:8080'::url, ''); -- OK +select url_port_set('https://example.com:8080'::url, '80'); -- OK +select url_port_set('https://example.com:8080'::url, '123456'); -- ERROR +select url_port_set('https://example.com:8080'::url, 80); -- OK +select url_port_set('https://example.com:8080'::url, 123456); -- ERROR + +select url_host_set('https://example.com'::url, NULL); -- ERROR +select url_host_set('https://example.com'::url, ''); -- ERROR +select url_host_set('https://example.com'::url, '123'); -- OK +select url_host_set('https://example.com'::url, 'αβγ'); -- OK + +select url_path_set('https://example.com/path/to/home'::url, NULL); -- OK +select url_path_set('https://example.com/path/to/home'::url, ''); -- OK +select url_path_set('https://example.com/path/to/home'::url, '/'); -- OK +select url_path_set('https://example.com/path/to/home'::url, 'αβγ'); -- OK + +select url_query_set('https://example.com?abc=xyz'::url, NULL); -- OK +select url_query_set('https://example.com?abc=xyz'::url, ''); -- OK +select url_query_set('https://example.com?abc=xyz'::url, 'αβγ'); -- OK + +select url_fragment_set('https://example.com#anchor'::url, NULL); -- OK 
select url_fragment_set('https://example.com#anchor'::url, ''); -- OK
select url_fragment_set('https://example.com#anchor'::url, 'αβγ'); -- OK
diff --git a/contrib/url/url--1.0.sql b/contrib/url/url--1.0.sql
new file mode 100644
index 0000000000..9521bbc2b8
--- /dev/null
+++ b/contrib/url/url--1.0.sql
@@ -0,0 +1,141 @@
-- complain if script is sourced in psql, rather than via CREATE EXTENSION
\echo Use "CREATE EXTENSION url" to load this file. \quit

-- Shell type; completed below once the I/O functions exist.
CREATE TYPE url;


CREATE FUNCTION url_in(cstring) RETURNS url
    IMMUTABLE
    STRICT
    LANGUAGE C
    AS '$libdir/url';

CREATE FUNCTION url_out(url) RETURNS cstring
    IMMUTABLE
    STRICT
    LANGUAGE C
    AS '$libdir/url';

CREATE TYPE url (
    INTERNALLENGTH = -1,
    INPUT = url_in,
    OUTPUT = url_out
);


CREATE CAST (url AS text) WITH INOUT AS ASSIGNMENT;
CREATE CAST (text AS url) WITH INOUT AS ASSIGNMENT;


-- Getters. All of them are STRICT: a NULL url yields NULL.

CREATE FUNCTION scheme(url) RETURNS text
    IMMUTABLE
    STRICT
    LANGUAGE C
    AS 'MODULE_PATHNAME','url_scheme';

CREATE FUNCTION username(url) RETURNS text
    IMMUTABLE
    STRICT
    LANGUAGE C
    AS 'MODULE_PATHNAME','url_username';

CREATE FUNCTION password(url) RETURNS text
    IMMUTABLE
    STRICT
    LANGUAGE C
    AS 'MODULE_PATHNAME','url_password';

CREATE FUNCTION port(url) RETURNS integer
    IMMUTABLE
    STRICT
    LANGUAGE C
    AS 'MODULE_PATHNAME','url_port';

CREATE FUNCTION host(url) RETURNS text
    IMMUTABLE
    STRICT
    LANGUAGE C
    AS 'MODULE_PATHNAME','url_host';

CREATE FUNCTION host_unicode(url) RETURNS text
    IMMUTABLE
    STRICT
    LANGUAGE C
    AS 'MODULE_PATHNAME','url_host_unicode';

CREATE FUNCTION path(url) RETURNS text
    IMMUTABLE
    STRICT
    LANGUAGE C
    AS 'MODULE_PATHNAME','url_path';

CREATE FUNCTION query(url) RETURNS text
    IMMUTABLE
    STRICT
    LANGUAGE C
    AS 'MODULE_PATHNAME','url_query';

CREATE FUNCTION fragment(url) RETURNS text
    IMMUTABLE
    STRICT
    LANGUAGE C
    AS 'MODULE_PATHNAME','url_fragment';

CREATE FUNCTION to_url(text) RETURNS url
    IMMUTABLE
    STRICT
    LANGUAGE C
    AS 'MODULE_PATHNAME','url_create';

-- url_base() and the setters below are deliberately not STRICT: the C code
-- receives NULL arguments and handles them itself.

CREATE FUNCTION url_base(url, text) RETURNS url
    IMMUTABLE
    LANGUAGE C
    AS 'MODULE_PATHNAME','url_base';


CREATE FUNCTION url_scheme_set(url, text) RETURNS url
    IMMUTABLE
    LANGUAGE C
    AS 'MODULE_PATHNAME','url_scheme_set';

CREATE FUNCTION url_username_set(url, text) RETURNS url
    IMMUTABLE
    LANGUAGE C
    AS 'MODULE_PATHNAME','url_username_set';

CREATE FUNCTION url_password_set(url, text) RETURNS url
    IMMUTABLE
    LANGUAGE C
    AS 'MODULE_PATHNAME','url_password_set';

CREATE FUNCTION url_host_set(url, text) RETURNS url
    IMMUTABLE
    LANGUAGE C
    AS 'MODULE_PATHNAME','url_host_set';

CREATE FUNCTION url_hostname_set(url, text) RETURNS url
    IMMUTABLE
    LANGUAGE C
    AS 'MODULE_PATHNAME','url_hostname_set';

CREATE FUNCTION url_port_set(url, text) RETURNS url
    IMMUTABLE
    LANGUAGE C
    AS 'MODULE_PATHNAME','url_port_set';

CREATE FUNCTION url_port_set(url, integer) RETURNS url
    IMMUTABLE
    LANGUAGE C
    AS 'MODULE_PATHNAME','url_port_num_set';

CREATE FUNCTION url_path_set(url, text) RETURNS url
    IMMUTABLE
    LANGUAGE C
    AS 'MODULE_PATHNAME','url_path_set';

CREATE FUNCTION url_query_set(url, text) RETURNS url
    IMMUTABLE
    LANGUAGE C
    AS 'MODULE_PATHNAME','url_query_set';

CREATE FUNCTION url_fragment_set(url, text) RETURNS url
    IMMUTABLE
    LANGUAGE C
    AS 'MODULE_PATHNAME','url_fragment_set';
diff --git a/contrib/url/url.c b/contrib/url/url.c
new file mode 100644
index 0000000000..fa2f1b82f1
--- /dev/null
+++ b/contrib/url/url.c
@@ -0,0 +1,994 @@
/*-------------------------------------------------------------------------
 * url/url.c
 *
 * By the WHATWG URL specification https://url.spec.whatwg.org/.
 * Supports Unicode ToASCII, ToUnicode.
 *
 * A complete URL entry consists of the following parts:
 *
 *   https://root:qwerty@example.com:8080/path/to/home?abc=xyz#anchor
 *   |___|   |__| |____| |_________| |__||___________| |_____| |____|
 *     |      |     |         |       |        |          |      |
 *   scheme   |  password     |      port      |        query    |
 *            |               |                |                 |
 *         username          host             path            fragment
 *
 *
 * Functions to get separate parts of a URL:
 *   scheme, username, password, host, host_unicode, port, path, query,
 *   fragment.
 *
 * Example:
 *   SELECT ('https://example.com/'::url).host;
 * Result:
 *   example.com
 *
 *
 * Functions to set separate parts of a URL:
 *   url_scheme_set, url_username_set, url_password_set, url_host_set,
 *   url_hostname_set, url_port_set, url_path_set, url_query_set,
 *   url_fragment_set.
 *
 * Example:
 *   SELECT url_host_set('https://example.com/'::url, 'postgresql.org');
 * Result:
 *   https://postgresql.org/
 *
 * All URL modification functions return the full modified URL.
 * All URL functions will return NULL if a NULL value is passed as the URL.
 *
 *
 * The url_base() function:
 *   Creates a new URL by resolving a relative URL against a base URL.
 *
 * Example:
 *   SELECT url_base('https://example.com/path/to'::url, '/new/path#and-fragment');
 *   SELECT url_base('https://example.com/path/to'::url, 'wss://postgresql.org/new/path#and-fragment');
 *   SELECT url_base('https://example.com/path/to/home/'::url, 'world');
 *   SELECT url_base('https://example.com/path/to/home/'::url, '../world');
 * Result:
 *   https://example.com/new/path#and-fragment
 *   wss://postgresql.org/new/path#and-fragment
 *   https://example.com/path/to/home/world
 *   https://example.com/path/to/world
 *
 * More information about functions can be found in the README.
+ * + *------------------------------------------------------------------------- + */ +#include +#include +#include +#include + +#include "lexbor/url/url.h" + + +PG_MODULE_MAGIC; + + +#define PG_RETURN_URL_P(p) PG_RETURN_POINTER(p) +#define PG_GETARG_URL_P(n) PG_DETOAST_DATUM(PG_GETARG_DATUM(n)) + +#define URL_LXB_STR_ARGS(str) (const char *) (str)->data, (str)->length +#define URL_VARDATA(vardata) ((uint8_t *) VARDATA(vardata)) +#define URL_HEAD_SIZE (URL_LAST_ENTRY * sizeof(uint32_t)) + + +typedef enum URLIndex +{ + URL_SCHEME = 0x00, + URL_USERNAME, + URL_PASSWORD, + URL_HOST, + URL_PATH, + URL_QUERY, + URL_FRAGMENT, + URL_SUM, + URL_PORT, + URL_LAST_ENTRY +} URLIndex; + +typedef struct URLCallbackContext +{ + char *result; + size_t length; +} URLCallbackContext; + +typedef struct varlena URL; + +typedef lxb_status_t +(*URLSetFunc)(lxb_url_t *url, lxb_url_parser_t *parser, + const lxb_char_t *data, size_t length); + + +/* + * External dynamically-loaded functions. + */ +PG_FUNCTION_INFO_V1(url_in); +PG_FUNCTION_INFO_V1(url_out); + +PG_FUNCTION_INFO_V1(url_scheme); +PG_FUNCTION_INFO_V1(url_username); +PG_FUNCTION_INFO_V1(url_password); +PG_FUNCTION_INFO_V1(url_host); +PG_FUNCTION_INFO_V1(url_host_unicode); +PG_FUNCTION_INFO_V1(url_port); +PG_FUNCTION_INFO_V1(url_path); +PG_FUNCTION_INFO_V1(url_query); +PG_FUNCTION_INFO_V1(url_fragment); +PG_FUNCTION_INFO_V1(url_create); +PG_FUNCTION_INFO_V1(url_base); + +PG_FUNCTION_INFO_V1(url_scheme_set); +PG_FUNCTION_INFO_V1(url_username_set); +PG_FUNCTION_INFO_V1(url_password_set); +PG_FUNCTION_INFO_V1(url_host_set); +PG_FUNCTION_INFO_V1(url_hostname_set); +PG_FUNCTION_INFO_V1(url_port_set); +PG_FUNCTION_INFO_V1(url_port_num_set); +PG_FUNCTION_INFO_V1(url_path_set); +PG_FUNCTION_INFO_V1(url_query_set); +PG_FUNCTION_INFO_V1(url_fragment_set); + +/* + * Internal declarations. 
+ */ +static lxb_url_t *url_parse(char *data, size_t length,const lxb_url_t *base); +static URL *url_change_part(URL *var_url, char *data, + size_t length, URLSetFunc set, const char *name); +static URL *url_new(char *data); +static lxb_status_t url_api_username_set(lxb_url_t *url, lxb_url_parser_t *parser, + const lxb_char_t *username, size_t length); +static lxb_status_t url_api_password_set(lxb_url_t *url, lxb_url_parser_t *parser, + const lxb_char_t *password, size_t length); +static URL *url_pack(lxb_url_t *url); +static uint32_t url_pack_size(lxb_url_t *url, uint32_t *head); +static void url_pack_string(const lexbor_str_t *str, uint8_t *data); +static void url_pack_scheme(const lxb_url_scheme_t *scheme, uint8_t *data); +static void url_pack_host(const lxb_url_host_t *host, uint8_t *data); +static void url_pack_path(const lxb_url_path_t *path, uint8_t *data); +static uint32_t url_string_size(const lexbor_str_t *str); +static uint32_t url_scheme_size(const lxb_url_scheme_t *scheme); +static uint32_t url_host_size(const lxb_url_host_t *host); +static uint32_t url_path_size(const lxb_url_path_t *path); +static void url_unpack(lxb_url_t *url, const uint8_t *data, lexbor_mraw_t *mraw); +static void url_unpack_string(lexbor_str_t *str, const uint8_t *data, + lexbor_mraw_t *mraw, URLIndex idx); +static void url_unpack_scheme(lxb_url_scheme_t *scheme, const uint8_t *data, + lexbor_mraw_t *mraw); +static void url_unpack_host(lxb_url_host_t *host, const uint8_t *data, + lexbor_mraw_t *mraw); +static void url_unpack_port(lxb_url_t *url, const uint8_t *data); +static void url_unpack_path(lxb_url_path_t *path, const uint8_t *data, + lexbor_mraw_t *mraw); +static void url_copy_data(lexbor_str_t *str, lexbor_mraw_t *mraw, + const uint8_t *data, uint32_t length); +static lxb_status_t url_callback(const lxb_char_t *data, size_t len, void *ctx); +static void *url_palloc0(size_t num, size_t size); + +/* + * Inline functions. 
+ */ +static inline char * +url_encoding_encode(char *str, uint32_t length) +{ + return (char *) pg_do_encoding_conversion((unsigned char *) str, length, + PG_UTF8, GetDatabaseEncoding()); +} + +static inline char * +url_encoding_decode(char *str, uint32_t length) +{ + return (char *) pg_do_encoding_conversion((unsigned char *) str, length, + GetDatabaseEncoding(), PG_UTF8); +} + +static inline const uint32_t * +url_head_entry(const uint8_t *data, URLIndex idx) +{ + return ((uint32_t *) data) + idx; +} + +static inline uint32_t +url_entry_offset(const uint8_t *data, URLIndex idx) +{ + return *url_head_entry(data, idx); +} + +static inline uint32_t +url_entry_length(const uint8_t *data, URLIndex idx) +{ + const uint32_t *off = url_head_entry(data, idx); + return off[1] - off[0]; +} + + +/* + * All NULL arguments will be considered as empty value. + */ +static char url_empty_str[] = ""; + + +/* + * Module load callback. + */ +void +_PG_init(void) +{ + /* Lexbor supports overriding the allocation routines. */ + lexbor_memory_setup(palloc, repalloc, url_palloc0, pfree); +} + +/* + * Input/Output. + */ + +/* + * The Input function parses/validated the URL and packs the parsed URL into an + * internal storage format. + * + * Format: + * Head contains offsets for each part of the URL: + * Bergin: 0 byte. End: (sizeof(uint32_t) * URL_LAST_ENTRY) byte. + * + * To get the necessary offset we just need sizeof(uint32_t) * URLIndex. + * + * Body: + * After head, the body with the URL entries begins. + * + * URL_PORT stores the port directly, not the offset to the body. 
+ */ +Datum +url_in(PG_FUNCTION_ARGS) +{ + PG_RETURN_URL_P(url_new(PG_GETARG_CSTRING(0))); +} + +Datum +url_out(PG_FUNCTION_ARGS) +{ + URL *vardata = PG_GETARG_URL_P(0); + lxb_url_t url; + URLCallbackContext ctx; + + url_unpack(&url, URL_VARDATA(vardata), NULL); + + ctx.result = palloc(lxb_url_length(&url, false) + 1); + ctx.length = 0; + + lxb_url_serialize(&url, url_callback, &ctx, false); + + ctx.result[ctx.length] = 0x00; + + /* + * We will not convert the encoding (no matter what encoding is in the base) + * because the URL is always returned in ASCII. All encodings that Postgres + * supports understand ASCII < 0x80. + */ + + PG_RETURN_CSTRING(ctx.result); +} + +/* + * Getter functions for get parts of URL. + * Scheme, username, password, host, host_unicode, port, path, query, fragment. + */ +Datum +url_scheme(PG_FUNCTION_ARGS) +{ + URL *vardata = PG_GETARG_URL_P(0); + lxb_url_scheme_t scheme; + + url_unpack_scheme(&scheme, URL_VARDATA(vardata), NULL); + + if (scheme.type == LXB_URL_SCHEMEL_TYPE__UNDEF) + PG_RETURN_NULL(); + + PG_RETURN_TEXT_P(cstring_to_text_with_len(URL_LXB_STR_ARGS(&scheme.name))); +} + +Datum +url_username(PG_FUNCTION_ARGS) +{ + URL *vardata = PG_GETARG_URL_P(0); + lexbor_str_t str; + + url_unpack_string(&str, URL_VARDATA(vardata), NULL, URL_USERNAME); + + if (str.length == 0) + PG_RETURN_NULL(); + + PG_RETURN_TEXT_P(cstring_to_text_with_len(URL_LXB_STR_ARGS(&str))); +} + +Datum +url_password(PG_FUNCTION_ARGS) +{ + URL *vardata = PG_GETARG_URL_P(0); + lexbor_str_t str; + + url_unpack_string(&str, URL_VARDATA(vardata), NULL, URL_PASSWORD); + + if (str.length == 0) + PG_RETURN_NULL(); + + PG_RETURN_TEXT_P(cstring_to_text_with_len(URL_LXB_STR_ARGS(&str))); +} + +Datum +url_host(PG_FUNCTION_ARGS) +{ + URL *vardata = PG_GETARG_URL_P(0); + text *txt; + size_t length; + URLCallbackContext ctx; + lxb_url_host_t host; + + url_unpack_host(&host, URL_VARDATA(vardata), NULL); + + if (host.type == LXB_URL_HOST_TYPE__UNDEF) + PG_RETURN_NULL(); + + 
length = lxb_url_host_length(&host); + txt = (text *) palloc(length + VARHDRSZ); + + ctx.result = VARDATA(txt); + ctx.length = 0; + + lxb_url_serialize_host(&host, url_callback, &ctx); + + SET_VARSIZE(txt, ctx.length + VARHDRSZ); + + PG_RETURN_TEXT_P(txt); +} + +Datum +url_host_unicode(PG_FUNCTION_ARGS) +{ + URL *vardata = PG_GETARG_URL_P(0); + char *result; + text *txt; + size_t length; + URLCallbackContext ctx; + lxb_url_host_t host; + + url_unpack_host(&host, URL_VARDATA(vardata), NULL); + + if (host.type == LXB_URL_HOST_TYPE__UNDEF) + PG_RETURN_NULL(); + + length = lxb_url_host_unicode_length(&host); + + txt = (text *) palloc(length + VARHDRSZ); + + ctx.result = VARDATA(txt); + ctx.length = 0; + + lxb_url_serialize_host_unicode(&host, url_callback, &ctx); + + result = url_encoding_encode(ctx.result, ctx.length); + + if (ctx.result != result) + { + ctx.length = strlen(result); + + /* + * Perhaps we should check the size, if it has not changed, then do not + * reallocate memory. + */ + pfree(txt); + txt = (text *) palloc(ctx.length + VARHDRSZ); + + memcpy(VARDATA(txt), result, ctx.length); + pfree(result); + } + + SET_VARSIZE(txt, ctx.length + VARHDRSZ); + + PG_RETURN_TEXT_P(txt); +} + +Datum +url_port(PG_FUNCTION_ARGS) +{ + URL *vardata = PG_GETARG_URL_P(0); + lxb_url_t url; + + url_unpack_port(&url, URL_VARDATA(vardata)); + + if (!url.has_port) + PG_RETURN_NULL(); + + PG_RETURN_UINT16(url.port); +} + +Datum +url_path(PG_FUNCTION_ARGS) +{ + URL *vardata = PG_GETARG_URL_P(0); + lxb_url_path_t path; + + url_unpack_path(&path, URL_VARDATA(vardata), NULL); + + if (path.str.length == 0) + PG_RETURN_NULL(); + + PG_RETURN_TEXT_P(cstring_to_text_with_len(URL_LXB_STR_ARGS(&path.str))); +} + +Datum +url_query(PG_FUNCTION_ARGS) +{ + URL *vardata = PG_GETARG_URL_P(0); + lexbor_str_t str; + + url_unpack_string(&str, URL_VARDATA(vardata), NULL, URL_QUERY); + + if (str.length == 0) + PG_RETURN_NULL(); + + PG_RETURN_TEXT_P(cstring_to_text_with_len(URL_LXB_STR_ARGS(&str))); +} + 
+Datum +url_fragment(PG_FUNCTION_ARGS) +{ + URL *vardata = PG_GETARG_URL_P(0); + lexbor_str_t str; + + url_unpack_string(&str, URL_VARDATA(vardata), NULL, URL_FRAGMENT); + + if (str.length == 0) + PG_RETURN_NULL(); + + PG_RETURN_TEXT_P(cstring_to_text_with_len(URL_LXB_STR_ARGS(&str))); +} + +Datum +url_create(PG_FUNCTION_ARGS) +{ + PG_RETURN_URL_P(url_new(TextDatumGetCString(PG_GETARG_DATUM(0)))); +} + +Datum +url_base(PG_FUNCTION_ARGS) +{ + char *str; + URL *varbase; + lxb_url_t *url, url_base; + + if (PG_ARGISNULL(0)) + PG_RETURN_NULL(); + + varbase = PG_GETARG_URL_P(0); + str = (PG_ARGISNULL(1)) ? url_empty_str + : TextDatumGetCString(PG_GETARG_DATUM(1)); + + url_unpack(&url_base, URL_VARDATA(varbase), NULL); + + url = url_parse(str, strlen(str), &url_base); + if (url == NULL) + ereport(ERROR, + errcode(ERRCODE_SYNTAX_ERROR), + errmsg("failed to parse the relative URL \"%s\"", str)); + + varbase = url_pack(url); + + lxb_url_memory_destroy(url); + + PG_RETURN_URL_P(varbase); +} + +/* + * Setter functions for change parts of URL. + * Scheme, username, password, host, hostname, port, path, query, fragment. + */ +#define URL_MAKE_SETTER_FUNCTION(name, func) \ + Datum \ + url_ ## name ## _set(PG_FUNCTION_ARGS) \ + { \ + char *str; \ + URL *vardata; \ + \ + if (PG_ARGISNULL(0)) \ + PG_RETURN_NULL(); \ + \ + vardata = PG_GETARG_URL_P(0); \ + str = (PG_ARGISNULL(1)) ? 
url_empty_str \ + : TextDatumGetCString(PG_GETARG_DATUM(1)); \ + \ + vardata = url_change_part(vardata, str, strlen(str), (func), #name); \ + \ + PG_RETURN_URL_P(vardata); \ + } + +URL_MAKE_SETTER_FUNCTION(scheme, lxb_url_api_protocol_set) +URL_MAKE_SETTER_FUNCTION(username, url_api_username_set) +URL_MAKE_SETTER_FUNCTION(password, url_api_password_set) +URL_MAKE_SETTER_FUNCTION(host, lxb_url_api_host_set) +URL_MAKE_SETTER_FUNCTION(hostname, lxb_url_api_hostname_set) +URL_MAKE_SETTER_FUNCTION(port, lxb_url_api_port_set) +URL_MAKE_SETTER_FUNCTION(path, lxb_url_api_pathname_set) +URL_MAKE_SETTER_FUNCTION(query, lxb_url_api_search_set) +URL_MAKE_SETTER_FUNCTION(fragment, lxb_url_api_hash_set) + +Datum +url_port_num_set(PG_FUNCTION_ARGS) +{ + URL *vardata = PG_GETARG_URL_P(0); + uint32_t port = PG_GETARG_UINT32(1); + int len; + char buf[12]; /* 10 digits, '\0' */ + + len = pg_ultoa_n(port, buf); + + vardata = url_change_part(vardata, buf, len, lxb_url_api_port_set, "port"); + + PG_RETURN_URL_P(vardata); +} + +/* + * Utilities for parsing, packaging, and unpacking URLs. 
+ */ +static lxb_url_t * +url_parse(char *data, size_t length, const lxb_url_t *base) +{ + char *dst; + lxb_url_t *url; + lxb_status_t status; + lxb_url_parser_t parser; + + status = lxb_url_parser_init(&parser, NULL); + if (status != LXB_STATUS_OK) + ereport(ERROR, + errcode(ERRCODE_INTERNAL_ERROR), + errmsg("failed to create URL parser")); + + dst = url_encoding_decode(data, length); + + if (data == dst) + url = lxb_url_parse(&parser, base, (const lxb_char_t *) data, length); + else + { + url = lxb_url_parse(&parser, base, + (const lxb_char_t *) dst, strlen(dst)); + pfree(dst); + } + + lxb_url_parser_destroy(&parser, false); + + return url; +} + +static URL * +url_change_part(URL *var_url, char *data, size_t length, + URLSetFunc set, const char *name) +{ + char *dst; + URL *result; + lxb_url_t url; + lxb_status_t status; + lexbor_mraw_t mraw; + + lexbor_mraw_init(&mraw, 4096); + + url_unpack(&url, URL_VARDATA(var_url), &mraw); + + dst = url_encoding_decode(data, length); + + if (data == dst) + status = set(&url, NULL, (const lxb_char_t *) data, length); + else + { + status = set(&url, NULL, (const lxb_char_t *) dst, strlen(dst)); + pfree(dst); + } + + if (status != LXB_STATUS_OK) + ereport(ERROR, + errcode(ERRCODE_SYNTAX_ERROR), + errmsg("failed to parse \"%s\" part \"%.*s\" of URL", + name, (int) length, data)); + + result = url_pack(&url); + + lexbor_mraw_destroy(&mraw, false); + + return result; +} + +static URL * +url_new(char *data) +{ + URL *vardata; + lxb_url_t *url; + + url = url_parse(data, strlen(data), NULL); + if (url == NULL) + ereport(ERROR, + errcode(ERRCODE_SYNTAX_ERROR), + errmsg("failed to parse the URL \"%s\"", data)); + + vardata = url_pack(url); + + lxb_url_memory_destroy(url); + + return vardata; +} + +static lxb_status_t +url_api_username_set(lxb_url_t *url, lxb_url_parser_t *parser, + const lxb_char_t *username, size_t length) +{ + (void) parser; + return lxb_url_api_username_set(url, username, length); +} + +static lxb_status_t 
+url_api_password_set(lxb_url_t *url, lxb_url_parser_t *parser, + const lxb_char_t *password, size_t length) +{ + (void) parser; + return lxb_url_api_password_set(url, password, length); +} + +static URL * +url_pack(lxb_url_t *url) +{ + URL *vardata; + uint8_t *data; + uint32_t size; + uint32_t head[URL_LAST_ENTRY]; + + size = url_pack_size(url, head); + + vardata = palloc(VARHDRSZ + size); + data = URL_VARDATA(vardata); + + /* Store head. */ + memcpy(data, head, URL_HEAD_SIZE); + + /* Store body. */ + url_pack_scheme(&url->scheme, data + head[URL_SCHEME]); + url_pack_string(&url->username, data + head[URL_USERNAME]); + url_pack_string(&url->password, data + head[URL_PASSWORD]); + url_pack_host(&url->host, data + head[URL_HOST]); + url_pack_path(&url->path, data + head[URL_PATH]); + url_pack_string(&url->query, data + head[URL_QUERY]); + url_pack_string(&url->fragment, data + head[URL_FRAGMENT]); + + SET_VARSIZE(vardata, VARHDRSZ + size); + + return vardata; +} + +static uint32_t +url_pack_size(lxb_url_t *url, uint32_t *head) +{ + head[URL_SCHEME] = URL_HEAD_SIZE; + head[URL_USERNAME] = head[URL_SCHEME] + url_scheme_size(&url->scheme); + head[URL_PASSWORD] = head[URL_USERNAME] + url_string_size(&url->username); + head[URL_HOST] = head[URL_PASSWORD] + url_string_size(&url->password); + head[URL_PATH] = head[URL_HOST] + url_host_size(&url->host); + head[URL_QUERY] = head[URL_PATH] + url_path_size(&url->path); + head[URL_FRAGMENT] = head[URL_QUERY] + url_string_size(&url->query); + head[URL_SUM] = head[URL_FRAGMENT] + url_string_size(&url->fragment); + + /* + * Port has size uint16_t there is no sense to write it separately, we can + * write it to the header at once. 
+ */ + head[URL_PORT] = url->port << 8 | (uint8_t) url->has_port; + + return head[URL_SUM]; +} + +static void +url_pack_string(const lexbor_str_t *str, uint8_t *data) +{ + if (str->length > 0) + memcpy(data, str->data, str->length); +} + +static void +url_pack_scheme(const lxb_url_scheme_t *scheme, uint8_t *data) +{ + if (scheme->type == LXB_URL_SCHEMEL_TYPE__UNDEF) + return; + + /* type + string. */ + *data = (uint8_t) scheme->type; + url_pack_string(&scheme->name, data + sizeof(uint8_t)); +} + +static void +url_pack_host(const lxb_url_host_t *host, uint8_t *data) +{ + *data = (uint8_t) host->type; + data += sizeof(uint8_t); + + switch (host->type) + { + case LXB_URL_HOST_TYPE_DOMAIN: + case LXB_URL_HOST_TYPE_OPAQUE: + url_pack_string(&host->u.domain, data); + break; + + case LXB_URL_HOST_TYPE_IPV4: + memcpy(data, &host->u.ipv4, sizeof(host->u.ipv4)); + break; + + case LXB_URL_HOST_TYPE_IPV6: + memcpy(data, host->u.ipv6, sizeof(host->u.ipv6)); + break; + + default: + break; + } +} + +static void +url_pack_path(const lxb_url_path_t *path, uint8_t *data) +{ + if (path->length == 0) + return; + + /* opaque + length + string. */ + *data = (uint8_t) path->opaque; + data += sizeof(uint8_t); + + *((uint32_t *) data) = (uint32_t) path->length; + data += sizeof(uint32_t); + + url_pack_string(&path->str, data); +} + +static uint32_t +url_string_size(const lexbor_str_t *str) +{ + return (uint32_t) str->length; +} + +static uint32_t +url_scheme_size(const lxb_url_scheme_t *scheme) +{ + if (scheme->type == LXB_URL_SCHEMEL_TYPE__UNDEF) + return 0; + + /* type + string. 
*/
+	return sizeof(uint8_t) + url_string_size(&scheme->name);
+}
+
+/*
+ * Number of bytes needed to serialize a host: one type byte followed by the
+ * payload (the domain/opaque string, the IPv4 value or the IPv6 array).
+ *
+ * Returns 0 for any other host type, i.e. nothing is stored for it.
+ */
+static uint32_t
+url_host_size(const lxb_url_host_t *host)
+{
+	uint32_t	size;
+
+	switch (host->type)
+	{
+		case LXB_URL_HOST_TYPE_DOMAIN:
+		case LXB_URL_HOST_TYPE_OPAQUE:
+			size = url_string_size(&host->u.domain);
+			break;
+
+		case LXB_URL_HOST_TYPE_IPV4:
+			size = sizeof(host->u.ipv4);
+			break;
+
+		case LXB_URL_HOST_TYPE_IPV6:
+			size = sizeof(host->u.ipv6);
+			break;
+
+		default:
+			return 0;
+	}
+
+	/* type + data. */
+	return sizeof(uint8_t) + size;
+}
+
+/*
+ * Number of bytes needed to serialize a path: one byte for the opaque flag,
+ * a uint32_t for path->length and then the path string.
+ *
+ * Returns 0 when path->length is 0, i.e. nothing is stored for it.
+ */
+static uint32_t
+url_path_size(const lxb_url_path_t *path)
+{
+	if (path->length == 0)
+		return 0;
+
+	/* opaque + length + string. */
+	return sizeof(uint8_t) + sizeof(uint32_t) + url_string_size(&path->str);
+}
+
+/*
+ * Fill *url from the serialized representation in data.
+ *
+ * When mraw is NULL the string members of *url point directly into data and
+ * are not copied; data must then outlive *url. When mraw is not NULL every
+ * string is copied into memory allocated from mraw and NUL-terminated.
+ */
+static void
+url_unpack(lxb_url_t *url, const uint8_t *data, lexbor_mraw_t *mraw)
+{
+	/*
+	 * TODO: We should check the size of the header and body before unpacking.
+	 * Nothing here knows how large data is, so the offsets and lengths stored
+	 * in it are trusted; a damaged value would make us read outside of the
+	 * buffer.
+	 */
+
+	/*
+	 * We can use the current memory partitioning to serialize/read lxb_url_t
+	 * (all getter functions).
+	 *
+	 * To change the URL object, we need to access the URL parser, which has
+	 * its own memory partition for lxb_url_t. Therefore, we copy data for all
+	 * setter functions.
+	 */
+	url->mraw = mraw;
+
+	url_unpack_scheme(&url->scheme, data, mraw);
+	url_unpack_string(&url->username, data, mraw, URL_USERNAME);
+	url_unpack_string(&url->password, data, mraw, URL_PASSWORD);
+	url_unpack_host(&url->host, data, mraw);
+	url_unpack_port(url, data);
+	url_unpack_path(&url->path, data, mraw);
+	url_unpack_string(&url->query, data, mraw, URL_QUERY);
+	url_unpack_string(&url->fragment, data, mraw, URL_FRAGMENT);
+}
+
+/*
+ * Unpack the plain string entry idx from data into *str.
+ *
+ * An entry of length 0 yields an all-zero lexbor_str_t. Otherwise *str
+ * either references the bytes inside data (mraw == NULL) or receives a
+ * NUL-terminated copy allocated from mraw.
+ */
+static void
+url_unpack_string(lexbor_str_t *str, const uint8_t *data, lexbor_mraw_t *mraw,
+				  URLIndex idx)
+{
+	lxb_char_t *begin;
+	const uint32_t offset = url_entry_offset(data, idx);
+	const uint32_t length = url_entry_length(data, idx);
+
+	if (length == 0)
+	{
+		memset(str, 0x00, sizeof(lexbor_str_t));
+		return;
+	}
+
+	begin = (lxb_char_t *) &data[offset];
+
+	if (mraw == NULL)
+	{
+		str->data = begin;
+		str->length = length;
+	}
+	else
+	{
+		url_copy_data(str, mraw, begin, length);
+	}
+}
+
+/*
+ * Unpack the scheme entry: one type byte followed by the scheme name.
+ *
+ * An entry of length 0 yields an all-zero lxb_url_scheme_t.
+ */
+static void
+url_unpack_scheme(lxb_url_scheme_t *scheme, const uint8_t *data,
+				  lexbor_mraw_t *mraw)
+{
+	lxb_char_t *begin;
+	lexbor_str_t *str;
+	const uint32_t offset = url_entry_offset(data, URL_SCHEME);
+	uint32_t	length = url_entry_length(data, URL_SCHEME);
+
+	if (length == 0)
+	{
+		memset(scheme, 0x00, sizeof(lxb_url_scheme_t));
+		return;
+	}
+
+	str = &scheme->name;
+	length -= 1;				/* skip type */
+
+	scheme->type = (lxb_url_scheme_type_t) data[offset];
+	begin = (lxb_char_t *) &data[offset + 1];
+
+	if (mraw == NULL)
+	{
+		str->data = begin;
+		str->length = length;
+	}
+	else
+	{
+		url_copy_data(str, mraw, begin, length);
+	}
+}
+
+/*
+ * Unpack the host entry: one type byte followed by the payload.
+ *
+ * An entry of length 0 yields an all-zero lxb_url_host_t. Domain and opaque
+ * hosts are handled like any other string. IPv4/IPv6 payloads are copied
+ * into the fixed-size members of host->u; the number of bytes copied is
+ * limited to the size of the destination, because the length comes from the
+ * serialized data and must not be able to overflow *host.
+ */
+static void
+url_unpack_host(lxb_url_host_t *host, const uint8_t *data, lexbor_mraw_t *mraw)
+{
+	lxb_char_t *begin;
+	size_t		ncopy;
+	const uint32_t offset = url_entry_offset(data, URL_HOST);
+	uint32_t	length = url_entry_length(data, URL_HOST);
+
+	if (length == 0)
+	{
+		memset(host, 0x00, sizeof(lxb_url_host_t));
+		return;
+	}
+
+	length -= 1;				/* skip type */
+
+	host->type = (lxb_url_host_type_t) data[offset];
+	begin = (lxb_char_t *) &data[offset + 1];
+
+	switch (host->type)
+	{
+		case LXB_URL_HOST_TYPE_DOMAIN:
+		case LXB_URL_HOST_TYPE_OPAQUE:
+			if (mraw == NULL)
+			{
+				host->u.domain.data = begin;
+				host->u.domain.length = length;
+			}
+			else
+			{
+				url_copy_data(&host->u.domain, mraw, begin, length);
+			}
+			break;
+
+		case LXB_URL_HOST_TYPE_IPV4:
+			/* Zero first so that a short payload leaves no garbage behind. */
+			ncopy = (length < sizeof(host->u.ipv4))
+				? length : sizeof(host->u.ipv4);
+			memset(&host->u.ipv4, 0x00, sizeof(host->u.ipv4));
+			memcpy(&host->u.ipv4, begin, ncopy);
+			break;
+
+		case LXB_URL_HOST_TYPE_IPV6:
+			ncopy = (length < sizeof(host->u.ipv6))
+				? length : sizeof(host->u.ipv6);
+			memset(host->u.ipv6, 0x00, sizeof(host->u.ipv6));
+			memcpy(host->u.ipv6, begin, ncopy);
+			break;
+
+		default:
+			break;
+	}
+}
+
+/*
+ * Unpack the port. The offset slot of the URL_PORT entry holds the packed
+ * value itself: bit 0 is the has_port flag, the bits from 8 upwards are the
+ * port number.
+ */
+static void
+url_unpack_port(lxb_url_t *url, const uint8_t *data)
+{
+	const uint32_t num = url_entry_offset(data, URL_PORT);
+
+	url->port = num >> 8;
+	url->has_port = num & 1;
+}
+
+/*
+ * Unpack the path entry: one byte for the opaque flag, a uint32_t holding
+ * path->length and then the path string.
+ *
+ * An entry too short to contain that fixed part (which includes length 0)
+ * yields an all-zero lxb_url_path_t.
+ */
+static void
+url_unpack_path(lxb_url_path_t *path, const uint8_t *data, lexbor_mraw_t *mraw)
+{
+	lxb_char_t *begin;
+	lexbor_str_t *str;
+	uint32_t	count;
+	const uint32_t header = sizeof(uint8_t) + sizeof(uint32_t);
+	const uint32_t offset = url_entry_offset(data, URL_PATH);
+	uint32_t	length = url_entry_length(data, URL_PATH);
+
+	/* Anything shorter than the fixed part would underflow below. */
+	if (length < header)
+	{
+		memset(path, 0x00, sizeof(lxb_url_path_t));
+		return;
+	}
+
+	str = &path->str;
+	length -= header;
+
+	/* skip path->opaque and path->length */
+	begin = (lxb_char_t *) &data[offset + header];
+
+	if (mraw == NULL)
+	{
+		str->data = begin;
+		str->length = length;
+	}
+	else
+	{
+		url_copy_data(str, mraw, begin, length);
+	}
+
+	path->opaque = data[offset];
+
+	/*
+	 * The stored length sits at offset + 1, which is not necessarily aligned
+	 * for uint32_t; go through memcpy() instead of dereferencing a cast
+	 * pointer.
+	 */
+	memcpy(&count, &data[offset + 1], sizeof(count));	/* +1 skip opaque */
+	path->length = count;
+}
+
+/*
+ * Store a NUL-terminated copy of data[0 .. length - 1], allocated from mraw,
+ * in *str.
+ *
+ * If the allocation fails *str is set to an empty string (data == NULL,
+ * length == 0) instead of writing through a NULL pointer.
+ */
+static void
+url_copy_data(lexbor_str_t *str, lexbor_mraw_t *mraw,
+			  const uint8_t *data, uint32_t length)
+{
+	/* Add the terminator in size_t, not in uint32_t where it could wrap. */
+	str->data = lexbor_mraw_alloc(mraw, (size_t) length + 1);
+
+	if (str->data == NULL)
+	{
+		str->length = 0;
+		return;
+	}
+
+	memcpy(str->data, data, length);
+
+	str->data[length] = 0x00;
+	str->length = length;
+}
+
+/*
+ * Serialization callback: append len bytes from data to context->result at
+ * position context->length and advance context->length. Always returns
+ * LXB_STATUS_OK.
+ *
+ * NOTE(review): there is no bounds check here; presumably the caller sizes
+ * context->result for the complete output beforehand -- confirm.
+ */
+static lxb_status_t
+url_callback(const lxb_char_t *data, size_t len, void *ctx)
+{
+	URLCallbackContext *context = ctx;
+
+	memcpy(context->result + context->length, data, len);
+	context->length += len;
+
+	return LXB_STATUS_OK;
+}
+
+/*
+ * calloc()-shaped wrapper around palloc0(): zeroed memory for num objects of
+ * size bytes each.
+ *
+ * Returns NULL when num * size does not fit in size_t, as calloc() does,
+ * rather than allocating a block that is too small.
+ */
+static void *
+url_palloc0(size_t num, size_t size)
+{
+	if (size != 0 && num > ((size_t) -1) / size)
+		return NULL;
+
+	return palloc0(size * num);
+}
diff --git a/contrib/url/url.control b/contrib/url/url.control
new file mode 100644
index 0000000000..bca8375a51
--- /dev/null
+++ b/contrib/url/url.control
@@ -0,0 +1,5 @@
+# url extension
+comment = 'data type for Uniform Resource Locator'
+default_version = '1.0'
+module_pathname = '$libdir/url'
+relocatable = true
-- 
2.39.5 (Apple Git-154)