User-defined analyzer

A user-defined analyzer processes text into tokens according to a user-defined function.

You can write a user-defined function to process text into tokens according to your needs. To use the function, specify udr.function_name, where function_name is the name of your registered function, as the value of the analyzer option when you create a basic text search index.

Examples

The following function, which is written in C, processes alphabetical and numeric characters into tokens and ignores all special characters except underscore (_):

/*
 * tokenize_alnum -- user-defined analyzer for a basic text search index.
 *
 * Splits the input text into tokens that consist of alphanumeric characters
 * and underscores (_). Every other character ends the current token and is
 * dropped. The tokens are returned in a single string in which each token is
 * enclosed in square brackets and consecutive tokens are separated by one
 * space, for example "[quick] [Brown] [fox] [under_score]".
 *
 * Parameters:
 *   string  text to tokenize
 *   fparam  function parameter block; used here only to mark the return
 *           value as NULL when no result is produced
 *
 * Returns:
 *   An mi_lvarchar that holds the bracketed token list, or NULL (with the
 *   return value flagged as NULL through fparam) if the input cannot be
 *   converted, memory cannot be allocated, or the output cursor cannot be
 *   advanced.
 */
/*ARGSUSED*/
UDREXPORT
mi_lvarchar* tokenize_alnum(
    mi_lvarchar*    string,
    MI_FPARAM*      fparam)
{
    mi_integer      status = MI_OK;
    mi_lvarchar*    rtn = NULL;
    gl_mchar_t*     src = NULL;     /* null-terminated copy of the input  */
    gl_mchar_t*     tgt = NULL;     /* output buffer                      */
    mi_integer      token = 0;      /* nonzero while a "[" is still open  */
    gl_mchar_t*     s;              /* read cursor in src                 */
    gl_mchar_t*     r;              /* write cursor in tgt                */

    ifx_gl_init();

    /*
     * Buffer sizing: a token adds at most three bytes of decoration
     * (" [" before it and "]" after it), so a one-byte token grows to at
     * most four bytes. Four output bytes per input byte plus the
     * terminator is therefore always enough.
     *
     * src is cast for strlen() because gl_mchar_t* is not char*.
     */
    if (((src = (gl_mchar_t*)mi_lvarchar_to_string(string)) == NULL) ||
        ((tgt = (gl_mchar_t*)mi_alloc((strlen((char*)src)*4)+1)) == NULL)) {
        status = MI_ERROR;
        goto cleanup;
    }
    s = src;
    r = tgt;
    /* s becomes NULL if the read cursor cannot be advanced. */
    while ((s != NULL) && (*s != '\0')) {
        if ((ifx_gl_ismalnum(s, IFX_GL_NO_LIMIT)) || (*s == '_')) {
            if (!token) {
                /* Start a new token; separate it from the previous one. */
                if (r != tgt) *r++ = ' ';
                *r++ = '[';
                token = 1;
            }
            /* Copy exactly one (possibly multibyte) character. */
            ifx_gl_mbsncpy(r, s, IFX_GL_NULL, 1);
            r = ifx_gl_mbsnext(r, IFX_GL_NO_LIMIT);
            if (r == NULL) {
                /* Never write through a failed write cursor. */
                status = MI_ERROR;
                goto cleanup;
            }
        }
        else {
            /* Any other character closes the token that is open, if any. */
            if (token) {
                *r++ = ']';
                token = 0;
            }
        }
        s = ifx_gl_mbsnext(s, IFX_GL_NO_LIMIT);
    }
    /* Close a token that runs to the end of the input. */
    if (token) *r++ = ']';
    *r = '\0';
    if ((rtn = mi_string_to_lvarchar((char*)tgt)) == NULL) {
        status = MI_ERROR;
        goto cleanup;
    }
cleanup:
    /* Do not hand back a partial result on failure. */
    if ((status != MI_OK) &&
        (rtn != NULL)) {
        mi_var_free(rtn);
        rtn = NULL;
    }
    if (tgt != NULL) mi_free(tgt);
    if (src != NULL) mi_free(src);
    if (rtn == NULL) mi_fp_setreturnisnull(fparam, 0, MI_TRUE);
    return rtn;
}

The following statement registers the function so that the database server can use it:

-- Register the C routine tokenize_alnum as an SQL function that takes an
-- lvarchar argument and returns an lvarchar.
-- NOT VARIANT declares that the function returns the same result whenever
-- it is called with the same argument.
-- EXTERNAL NAME gives the path of the shared-object file followed, in
-- parentheses, by the name of the entry point inside that file.
CREATE FUNCTION tokenize_alnum (lvarchar)
    RETURNS lvarchar
    WITH (NOT VARIANT)
    EXTERNAL NAME "$ONEDB_HOME/extend/myblade/myblade.bld(tokenize_alnum)" 
        LANGUAGE C;

The following example shows a line of input text followed by the tokens that are indexed when the index is created with the analyzer="udr.tokenize_alnum" option. No special characters except the underscore are indexed:

quick! #$%&^^$## Brown fox under_score 
[quick] [Brown] [fox] [under_score]