Re: Automatic detection of client encoding

From: Karel Zak <zakkr(at)zf(dot)jcu(dot)cz>
To: Peter Eisentraut <peter_e(at)gmx(dot)net>
Cc: PostgreSQL Development <pgsql-hackers(at)postgresql(dot)org>
Subject: Re: Automatic detection of client encoding
Date: 2003-05-29 09:09:37
Message-ID: 20030529090937.GD21196@zf.jcu.cz
Views: Raw Message | Whole Thread | Download mbox | Resend email
Thread:
Lists: pgsql-hackers

On Wed, May 28, 2003 at 11:56:07PM +0200, Peter Eisentraut wrote:
> There is a standard interface (SUSv2) for detecting the character set
> based on the locale settings. I suggest we use this (if available) in
> applications like psql and pg_dump by default unless it is overridden by
> the usual mechanisms. If the character set name obtained this way is not
> recognized by PostgreSQL, we fall back to SQL_ASCII.
>
> Here's a piece of code that shows how this would work:
>
> #include <stdio.h>
> #include <locale.h>
> #include <langinfo.h>
>
> int
> main(int argc, char *argv[])
> {
> setlocale(LC_ALL, "");
> printf("%s\n", nl_langinfo(CODESET));
> return 0;
> }
>
> (LC_CTYPE is the governing category for this.)
>
> Comments?

It isn't enough for all OSes. Please look at glib or libcharset for this
problem.

http://www.haible.de/bruno/packages-libcharset.html

In my project I use the following code, which is a simplification of
libcharset (the main function is mp_locale_charset()).
Maybe it will help you :-)

/* Determine a canonical name for the current locale's character encoding.
*
* mp_locale_charset() is inspired by libcharset:
*
* Copyright (C) 2000-2002 Free Software Foundation, Inc.
* Written by Bruno Haible <bruno(at)clisp(dot)org>.
*
* $Id: charset.c,v 1.2 2003/01/24 14:02:01 zakkr Exp $
*/

#include "mape.h"

#if HAVE_STDDEF_H
# include <stddef.h>
#endif

#include <stdio.h>
#if HAVE_STRING_H
# include <string.h>
#else
# include <strings.h>
#endif
#if HAVE_STDLIB_H
# include <stdlib.h>
#endif

/* Normalise the Windows platform macro: the rest of the file tests WIN32. */
#if defined _WIN32 || defined __WIN32__
# undef WIN32 /* avoid warning on mingw32 */
# define WIN32
#endif

#if defined __EMX__
/* Assume EMX program runs on OS/2, even if compiled under DOS. */
# define OS2
#endif

/*
 * Pull in the header for whichever locale-query mechanism is available:
 * <langinfo.h> when nl_langinfo(CODESET) was detected, otherwise <locale.h>
 * when setlocale was detected; on Windows, <windows.h> instead.
 */
#if !defined WIN32
# if HAVE_LANGINFO_CODESET
# include <langinfo.h>
# else
# if HAVE_SETLOCALE
# include <locale.h>
# endif
# endif
#elif defined WIN32
# define WIN32_LEAN_AND_MEAN
# include <windows.h>
#endif
/*
 * OS/2 is tested separately from the chain above, so <os2.h> is included
 * in addition to whatever was selected there.
 * NOTE(review): INCL_DOS presumably enables the Dos* declarations
 * (DosQueryCp is called further down) -- confirm against the OS/2 toolkit.
 */
#if defined OS2
# define INCL_DOS
# include <os2.h>
#endif

/* One row of a charset alias table: a name to look for and its replacement. */
typedef struct MpCharsetAlias
{
	char *alias;	/* name that is matched against the input */
	char *name;	/* name that is returned when alias matches */
} MpCharsetAlias;

/*
 * Determine the charset name of the current locale. If charset is not
 * NULL the name is stored in *charset. Returns TRUE when the name is
 * "UTF8" or "UTF-8" (compared case-insensitively), FALSE otherwise.
 */
extern mpbool mp_locale_charset (char **charset);

/*
 * Map a platform-specific charset name onto a name that iconv understands.
 *
 * libcharset loads its alias table from an external text file, which is a
 * strange and slow solution; here the table is compiled into the source
 * and only the rows for the platform being built are included. With a
 * "good" libc (on Linux, for example) no rows are needed at all.
 *
 * Please add only exotic aliases here. The libc iconv already knows a lot
 * of basic aliases (check with "iconv -l" first).
 *
 * The platform is selected through the compiler's predefined macros. The
 * preprocessor cannot compare names: in a test such as
 * "#if PORTNAME == aix" every identifier that is not a macro is replaced
 * by 0, so the test is always true and the AIX table would be used
 * everywhere.
 *
 * name: charset name as reported by the system; may be NULL.
 *
 * Returns the replacement name when an alias matches (case-insensitively),
 * otherwise name itself (NULL for NULL). The result is either name or a
 * string literal, so it must not be modified or freed.
 */
static const char *
mp_charset_aliases (const char *name)
{
	/* Built once; the { NULL, NULL } row terminates the table. */
	static const struct
	{
		const char *alias;
		const char *name;
	} aliases[] =
	{
#if defined WIN32
		{ "CP936", "GBK" },
		{ "CP1361", "JOHAB" },
		{ "CP20127", "ASCII" },
		{ "CP20866", "KOI8-R" },
		{ "CP21866", "KOI8-RU" },
		{ "CP28591", "ISO-8859-1" },
		{ "CP28592", "ISO-8859-2" },
		{ "CP28593", "ISO-8859-3" },
		{ "CP28594", "ISO-8859-4" },
		{ "CP28595", "ISO-8859-5" },
		{ "CP28596", "ISO-8859-6" },
		{ "CP28597", "ISO-8859-7" },
		{ "CP28598", "ISO-8859-8" },
		{ "CP28599", "ISO-8859-9" },
		{ "CP28605", "ISO-8859-15" },
#elif defined _AIX
		{ "IBM-850", "CP850" },
		{ "IBM-856", "CP856" },
		{ "IBM-921", "ISO-8859-13" },
		{ "IBM-922", "CP922" },
		{ "IBM-932", "CP932" },
		{ "IBM-943", "CP943" },
		{ "IBM-1046", "CP1046" },
		{ "IBM-1124", "CP1124" },
		{ "IBM-1129", "CP1129" },
		{ "IBM-1252", "CP1252" },
		{ "IBM-EUCCN", "GB2312" },
		{ "IBM-EUCJP", "EUC-JP" },
		{ "IBM-EUCKR", "EUC-KR" },
		{ "IBM-EUCTW", "EUC-TW" },
#elif defined __hpux
		{ "ROMAN8", "HP-ROMAN8" },
		{ "ARABIC8", "HP-ARABIC8" },
		{ "GREEK8", "HP-GREEK8" },
		{ "HEBREW8", "HP-HEBREW8" },
		{ "TURKISH8", "HP-TURKISH8" },
		{ "KANA8", "HP-KANA8" },
		{ "HP15CN", "GB2312" },
#elif defined __sgi
		{ "EUCCN", "GB2312" },
#elif defined __osf__
		{ "KSC5601", "CP949" },
		{ "SDECKANJI", "EUC-JP" },
		{ "TACTIS", "TIS-620" },
#elif defined __sun
		{ "646", "ASCII" },
		{ "CNS11643", "EUC-TW" },
		{ "5601", "EUC-KR" },
		{ "JOHAP92", "JOHAB" },
		{ "PCK", "SHIFT_JIS" },
		{ "2533", "TIS-620" },
#elif defined __NetBSD__
		{ "646", "ASCII" },
		{ "EUCCN", "GB2312" },
#endif
		{ NULL, NULL }
	};
	size_t i;

	if (name == NULL)
		return NULL;

	for (i = 0; aliases[i].alias != NULL; i++)
	{
		if (strcasecmp (aliases[i].alias, name) == 0)
			return aliases[i].name;
	}

	/*
	 * Return the original name: iconv() will probably know something
	 * better about it if we don't.
	 */
	return name;
}

/*
 * Extract the charset part of a locale name of the form
 * "language_COUNTRY.charset@modifier".
 *
 * locale: locale name; may be NULL.
 *
 * Returns:
 *   - NULL when locale is NULL;
 *   - locale itself when it contains no '.';
 *   - a pointer into locale (just past the '.') when there is no
 *     "@modifier" trailer;
 *   - otherwise a static buffer holding the charset with the trailer
 *     removed. The buffer is overwritten by the next such call. If the
 *     charset does not fit into the buffer, locale is returned unchanged.
 *
 * The function is needed whenever nl_langinfo(CODESET) is not used, and
 * always on OS/2, where the environment variables override the codepage.
 */
#if !HAVE_LANGINFO_CODESET || defined OS2
static char *
mp_encoding_from_locale(char *locale)
{
	static char buf[2 + 10 + 1];
	char *dot;
	const char *modifier;
	size_t len;

	if (locale == NULL)
		return NULL;

	dot = strchr (locale, '.');
	if (dot == NULL)
		return locale;

	/* Step over the '.' to the first character of the charset. */
	dot++;

	/* Look for the possible @... trailer and remove it, if any. */
	modifier = strchr (dot, '@');
	if (modifier == NULL)
		return dot;

	len = (size_t) (modifier - dot);
	if (len < sizeof (buf))
	{
		memcpy (buf, dot, len);
		buf[len] = '\0';
		return buf;
	}

	return locale;
}
#endif

/*
 * Determine a canonical name for the current locale's character encoding.
 *
 * The raw name comes from, depending on the platform:
 *   - nl_langinfo (CODESET) where available (this function does not call
 *     setlocale itself);
 *   - otherwise the LC_ALL, LC_CTYPE and LANG environment variables, in
 *     that order;
 *   - GetACP () on Windows;
 *   - on OS/2 the same environment variables and, if none is set,
 *     DosQueryCp ().
 * The raw name is then passed through mp_charset_aliases (). If nothing
 * usable was found, the CHARSET environment variable is tried and finally
 * "ASCII" is used, so the result is never NULL or empty.
 *
 * charset: if not NULL, receives the charset name. The string may live in
 *          a static buffer, in the environment or in a string literal;
 *          the caller must not modify or free it.
 *
 * Returns TRUE when the charset is "UTF8" or "UTF-8" (case-insensitive),
 * FALSE otherwise.
 */
mpbool
mp_locale_charset (char **charset)
{
	const char *codeset = NULL;

#if !(defined WIN32 || defined OS2)

# if HAVE_LANGINFO_CODESET
	/* Most systems support nl_langinfo (CODESET) nowadays. */
	codeset = nl_langinfo (CODESET);
# else
	/* On old systems which lack it, use setlocale or getenv. */
	char *locale = NULL;

	/* But most old systems don't have a complete set of locales. Some
	 * (like SunOS 4 or DJGPP) have only the C locale. Therefore we don't
	 * use setlocale here; it would return "C" when it doesn't support the
	 * locale name the user has set.
	 */
#  if HAVE_SETLOCALE && 0
	locale = setlocale (LC_CTYPE, NULL);
#  endif
	if (locale == NULL || locale[0] == '\0')
	{
		locale = getenv ("LC_ALL");
		if (locale == NULL || locale[0] == '\0')
		{
			locale = getenv ("LC_CTYPE");
			if (locale == NULL || locale[0] == '\0')
				locale = getenv ("LANG");
		}
	}

	/* On some old systems, one used to set locale = "iso8859_1". On others,
	 * you set it to "language_COUNTRY.charset". Either way only the charset
	 * part is kept. When no variable is set, codeset stays NULL and the
	 * fallbacks below apply.
	 */
	if (locale != NULL && locale[0] != '\0')
		codeset = mp_encoding_from_locale (locale);
# endif /* HAVE_LANGINFO_CODESET */

#elif defined WIN32

	/* "CP" + at most 10 digits + terminating NUL */
	static char buf[2 + 10 + 1];

	/* Woe32 has a function returning the locale's codepage as a number. */
	sprintf (buf, "CP%u", (unsigned int) GetACP ());
	codeset = buf;

#elif defined OS2

	char *locale;
	/* "CP" + at most 10 digits + terminating NUL */
	static char buf[2 + 10 + 1];
	ULONG cp[3];
	ULONG cplen;

	/* Allow user to override the codeset, as set in the operating system,
	 * with standard language environment variables.
	 */
	locale = getenv ("LC_ALL");
	if (locale == NULL || locale[0] == '\0')
	{
		locale = getenv ("LC_CTYPE");
		if (locale == NULL || locale[0] == '\0')
			locale = getenv ("LANG");
	}
	if (locale != NULL && locale[0] != '\0')
		codeset = mp_encoding_from_locale (locale);
	else
	{
		/* OS/2 has a function returning the locale's codepage as a number. */
		if (DosQueryCp (sizeof (cp), cp, &cplen))
			codeset = "";
		else
		{
			/* ULONG is not unsigned int; match the format to the argument. */
			sprintf (buf, "CP%lu", (unsigned long) cp[0]);
			codeset = buf;
		}
	}
#endif

	if (codeset == NULL)
		/* The canonical name cannot be determined. */
		codeset = "";
	else
		codeset = mp_charset_aliases (codeset);

	/* Don't return an empty string. GNU libc and GNU libiconv interpret
	 * the empty string as denoting "the locale's character encoding",
	 * thus GNU libiconv would call this function a second time.
	 */
	if (codeset[0] == '\0')
	{
		/*
		 * Last possibility is the 'CHARSET' environment variable; an
		 * empty value counts as unset.
		 */
		codeset = getenv ("CHARSET");
		if (codeset == NULL || codeset[0] == '\0')
			codeset = "ASCII";
	}

	if (charset)
		*charset = (char *) codeset;

	if (strcasecmp (codeset, "UTF8") == 0 || strcasecmp (codeset, "UTF-8") == 0)
		return TRUE;

	return FALSE;
}

autoconf part:
-------------

dnl jm_LANGINFO_CODESET
dnl -------------------
dnl Check for <langinfo.h> and nl_langinfo, then try to link a program that
dnl calls nl_langinfo(CODESET). If that succeeds, define
dnl HAVE_LANGINFO_CODESET to 1. The result of the link test is cached in
dnl jm_cv_langinfo_codeset.
dnl
dnl The macro name and all macro arguments are quoted so that they are not
dnl expanded prematurely (and aclocal does not warn about an underquoted
dnl definition).
AC_DEFUN([jm_LANGINFO_CODESET],
[
  AC_CHECK_HEADERS([langinfo.h])
  AC_CHECK_FUNCS([nl_langinfo])

  AC_CACHE_CHECK([for nl_langinfo and CODESET], [jm_cv_langinfo_codeset],
    [AC_TRY_LINK([#include <langinfo.h>],
      [char* cs = nl_langinfo(CODESET);],
      [jm_cv_langinfo_codeset=yes],
      [jm_cv_langinfo_codeset=no])
    ])
  if test "$jm_cv_langinfo_codeset" = yes; then
    AC_DEFINE([HAVE_LANGINFO_CODESET], [1],
      [Define if you have <langinfo.h> and nl_langinfo(CODESET).])
  fi
])

--
Karel Zak <zakkr(at)zf(dot)jcu(dot)cz>
http://home.zf.jcu.cz/~zakkr/

In response to

Browse pgsql-hackers by date

  From Date Subject
Next Message ow 2003-05-29 11:26:00 No more RH7.3 RPMs?
Previous Message John DeSoi 2003-05-29 03:56:45 Re: 3.0 fe/be protocol bug?