privoxy-3.0.21-stable/./cgi.c 000640 001751 001751 00000223323 12060362340 014671 0 ustar 00fk fk 000000 000000 const char cgi_rcs[] = "$Id: cgi.c,v 1.158 2012/12/07 12:45:20 fabiankeil Exp $";
/*********************************************************************
*
* File : $Source: /cvsroot/ijbswa/current/cgi.c,v $
*
* Purpose : Declares functions to intercept request, generate
* html or gif answers, and to compose HTTP resonses.
* This only contains the framework functions, the
* actual handler functions are declared elsewhere.
*
* Copyright : Written by and Copyright (C) 2001-2004, 2006-2008
* the SourceForge Privoxy team. http://www.privoxy.org/
*
* Based on the Internet Junkbuster originally written
* by and Copyright (C) 1997 Anonymous Coders and
* Junkbusters Corporation. http://www.junkbusters.com
*
* This program is free software; you can redistribute it
* and/or modify it under the terms of the GNU General
* Public License as published by the Free Software
* Foundation; either version 2 of the License, or (at
* your option) any later version.
*
* This program is distributed in the hope that it will
* be useful, but WITHOUT ANY WARRANTY; without even the
* implied warranty of MERCHANTABILITY or FITNESS FOR A
* PARTICULAR PURPOSE. See the GNU General Public
* License for more details.
*
* The GNU General Public License should be included with
* this file. If not, you can view it at
* http://www.gnu.org/copyleft/gpl.html
* or write to the Free Software Foundation, Inc., 59
* Temple Place - Suite 330, Boston, MA 02111-1307, USA.
*
**********************************************************************/
#include "config.h"
#include
#include
#include
#include
#include
#include
#include
#ifdef FEATURE_COMPRESSION
#include
#endif
#include "project.h"
#include "cgi.h"
#include "list.h"
#include "encode.h"
#include "ssplit.h"
#include "errlog.h"
#include "filters.h"
#include "miscutil.h"
#include "cgisimple.h"
#include "jbsockets.h"
#if defined(FEATURE_CGI_EDIT_ACTIONS) || defined(FEATURE_TOGGLE)
#include "cgiedit.h"
#endif /* defined(FEATURE_CGI_EDIT_ACTIONS) || defined (FEATURE_TOGGLE) */
/* loadcfg.h is for global_toggle_state only */
#include "loadcfg.h"
/* jcc.h is for mutex semaphore globals only */
#include "jcc.h"
/* Version identifier string built from the CGI_H_VERSION macro (presumably provided by cgi.h). */
const char cgi_h_rcs[] = CGI_H_VERSION;
/*
 * List of CGI functions: name, handler, description, harmless flag.
 *
 * Note: Do NOT use single quotes in the description;
 * this will break the dynamic "blocked" template!
 *
 * dispatch_known_cgi() walks this table from the top and stops at the
 * first entry whose name equals the requested CGI name, or at the
 * terminating entry whose name is NULL. Entries whose last field is
 * FALSE are only run if referrer_is_safe() accepts the request.
 */
static const struct cgi_dispatcher cgi_dispatchers[] = {
{ "",
cgi_default,
"Privoxy main page",
TRUE },
#ifdef FEATURE_GRACEFUL_TERMINATION
{ "die",
cgi_die,
"Shut down - Do not deploy this build in a production environment, "
"this is a one click Denial Of Service attack!!!",
FALSE },
#endif
{ "show-status",
cgi_show_status,
#ifdef FEATURE_CGI_EDIT_ACTIONS
"View & change the current configuration",
#else
"View the current configuration",
#endif
TRUE },
{ "show-version",
cgi_show_version,
"View the source code version numbers",
TRUE },
{ "show-request",
cgi_show_request,
"View the request headers",
TRUE },
{ "show-url-info",
cgi_show_url_info,
"Look up which actions apply to a URL and why",
TRUE },
#ifdef FEATURE_TOGGLE
{ "toggle",
cgi_toggle,
"Toggle Privoxy on or off",
FALSE },
#endif /* def FEATURE_TOGGLE */
#ifdef FEATURE_CGI_EDIT_ACTIONS
{ "edit-actions", /* Edit the actions list */
cgi_edit_actions,
NULL, FALSE },
{ "eaa", /* Shortcut for edit-actions-add-url-form */
cgi_edit_actions_add_url_form,
NULL, FALSE },
{ "eau", /* Shortcut for edit-actions-url-form */
cgi_edit_actions_url_form,
NULL, FALSE },
{ "ear", /* Shortcut for edit-actions-remove-url-form */
cgi_edit_actions_remove_url_form,
NULL, FALSE },
{ "eal", /* Shortcut for edit-actions-list */
cgi_edit_actions_list,
NULL, FALSE },
{ "eafu", /* Shortcut for edit-actions-for-url */
cgi_edit_actions_for_url,
NULL, FALSE },
{ "eas", /* Shortcut for edit-actions-submit */
cgi_edit_actions_submit,
NULL, FALSE },
{ "easa", /* Shortcut for edit-actions-section-add */
cgi_edit_actions_section_add,
NULL, FALSE },
{ "easr", /* Shortcut for edit-actions-section-remove */
cgi_edit_actions_section_remove,
NULL, FALSE },
{ "eass", /* Shortcut for edit-actions-section-swap */
cgi_edit_actions_section_swap,
NULL, FALSE },
{ "edit-actions-for-url",
cgi_edit_actions_for_url,
NULL, FALSE /* Edit the actions for (a) specified URL(s) */ },
{ "edit-actions-list",
cgi_edit_actions_list,
NULL, TRUE /* Edit the actions list */ },
{ "edit-actions-submit",
cgi_edit_actions_submit,
NULL, FALSE /* Change the actions for (a) specified URL(s) */ },
{ "edit-actions-url",
cgi_edit_actions_url,
NULL, FALSE /* Change a URL pattern in the actionsfile */ },
{ "edit-actions-url-form",
cgi_edit_actions_url_form,
NULL, FALSE /* Form to change a URL pattern in the actionsfile */ },
{ "edit-actions-add-url",
cgi_edit_actions_add_url,
NULL, FALSE /* Add a URL pattern to the actionsfile */ },
{ "edit-actions-add-url-form",
cgi_edit_actions_add_url_form,
NULL, FALSE /* Form to add a URL pattern to the actionsfile */ },
{ "edit-actions-remove-url",
cgi_edit_actions_remove_url,
NULL, FALSE /* Remove a URL pattern from the actionsfile */ },
{ "edit-actions-remove-url-form",
cgi_edit_actions_remove_url_form,
NULL, FALSE /* Form to remove a URL pattern from the actionsfile */ },
{ "edit-actions-section-add",
cgi_edit_actions_section_add,
NULL, FALSE /* Add a section to the actionsfile */ },
{ "edit-actions-section-remove",
cgi_edit_actions_section_remove,
NULL, FALSE /* Remove a section from the actionsfile */ },
{ "edit-actions-section-swap",
cgi_edit_actions_section_swap,
NULL, FALSE /* Swap two sections in the actionsfile */ },
#endif /* def FEATURE_CGI_EDIT_ACTIONS */
{ "error-favicon.ico",
cgi_send_error_favicon,
NULL, TRUE /* Sends the favicon image for error pages. */ },
{ "favicon.ico",
cgi_send_default_favicon,
NULL, TRUE /* Sends the default favicon image. */ },
{ "robots.txt",
cgi_robots_txt,
NULL, TRUE /* Sends a robots.txt file to tell robots to go away. */ },
{ "send-banner",
cgi_send_banner,
NULL, TRUE /* Send a built-in image */ },
{ "send-stylesheet",
cgi_send_stylesheet,
NULL, FALSE /* Send templates/cgi-style.css */ },
{ "t",
cgi_transparent_image,
NULL, TRUE /* Send a transparent image (short name) */ },
{ "url-info-osd.xml",
cgi_send_url_info_osd,
NULL, TRUE /* Send templates/url-info-osd.xml */ },
{ "user-manual",
cgi_send_user_manual,
NULL, TRUE /* Send user-manual */ },
{ NULL, /* NULL Indicates end of list and default page */
cgi_error_404,
NULL, TRUE /* Unknown CGI page */ }
};
/*
 * Built-in images for ad replacement
 *
 * Which encoding is compiled in (PNG or GIF) depends on whether
 * FEATURE_NO_GIFS is defined.
 *
 * Hint: You can encode your own images like this:
 * cat your-image | perl -e 'while (read STDIN, $c, 1) { printf("\\%.3o", unpack("C", $c)); }'
 */
#ifdef FEATURE_NO_GIFS
/*
 * Checkerboard pattern, as a PNG.
 */
const char image_pattern_data[] =
"\211\120\116\107\015\012\032\012\000\000\000\015\111\110\104"
"\122\000\000\000\004\000\000\000\004\010\006\000\000\000\251"
"\361\236\176\000\000\000\006\142\113\107\104\000\000\000\000"
"\000\000\371\103\273\177\000\000\000\033\111\104\101\124\010"
"\327\143\140\140\140\060\377\377\377\077\003\234\106\341\060"
"\060\230\063\020\124\001\000\161\021\031\241\034\364\030\143"
"\000\000\000\000\111\105\116\104\256\102\140\202";
/*
 * 1x1 transparent PNG.
 */
const char image_blank_data[] =
"\211\120\116\107\015\012\032\012\000\000\000\015\111\110\104\122"
"\000\000\000\001\000\000\000\001\001\003\000\000\000\045\333\126"
"\312\000\000\000\003\120\114\124\105\377\377\377\247\304\033\310"
"\000\000\000\001\164\122\116\123\000\100\346\330\146\000\000\000"
"\001\142\113\107\104\000\210\005\035\110\000\000\000\012\111\104"
"\101\124\170\001\143\140\000\000\000\002\000\001\163\165\001\030"
"\000\000\000\000\111\105\116\104\256\102\140\202";
#else
/*
 * Checkerboard pattern, as a GIF.
 */
const char image_pattern_data[] =
"\107\111\106\070\071\141\004\000\004\000\200\000\000\310\310"
"\310\377\377\377\041\376\016\111\040\167\141\163\040\141\040"
"\142\141\156\156\145\162\000\041\371\004\001\012\000\001\000"
"\054\000\000\000\000\004\000\004\000\000\002\005\104\174\147"
"\270\005\000\073";
/*
 * 1x1 transparent GIF.
 */
const char image_blank_data[] =
"GIF89a\001\000\001\000\200\000\000\377\377\377\000\000"
"\000!\371\004\001\000\000\000\000,\000\000\000\000\001"
"\000\001\000\000\002\002D\001\000;";
#endif
/*
 * Image sizes in bytes. The "- 1" drops the terminating NUL that the
 * compiler appends to each string literal; it is not part of the image.
 */
const size_t image_pattern_length = sizeof(image_pattern_data) - 1;
const size_t image_blank_length = sizeof(image_blank_data) - 1;
#ifdef FEATURE_COMPRESSION
/*
 * Minimum length which a buffer has to reach before
 * we bother to (re-)compress it. Completely arbitrary.
 */
const size_t LOWER_LENGTH_LIMIT_FOR_COMPRESSION = 1024U;
#endif
/*
 * Statically allocated response returned by cgi_error_memory(), so that
 * reporting an out-of-memory condition does not itself need to allocate.
 * Its fields are filled in by cgi_init_error_messages().
 */
static struct http_response cgi_error_memory_response[1];
/* Forward declarations for helpers defined further down in this file. */
static struct http_response *dispatch_known_cgi(struct client_state * csp,
const char * path);
static struct map *parse_cgi_parameters(char *argstring);
/*********************************************************************
 *
 * Function    :  dispatch_cgi
 *
 * Description :  Decides whether a request is addressed to Privoxy's
 *                built-in web interface. That is the case if the
 *                host is CGI_SITE_1_HOST (usually http://p.p/), or
 *                if the host is CGI_SITE_2_HOST and the path starts
 *                with CGI_SITE_2_PATH (usually
 *                http://config.privoxy.org/). A trailing dot on the
 *                hostname is tolerated in both cases. If the request
 *                is for us, the remainder of the path is handed to
 *                dispatch_known_cgi(), which runs the CGI handler.
 *
 * Parameters  :
 *          1  :  csp = Current client state (buffers, headers, etc...)
 *
 * Returns     :  http_response if match, NULL if nonmatch or handler fail
 *
 *********************************************************************/
struct http_response *dispatch_cgi(struct client_state *csp)
{
   const char *requested_host = csp->http->host;
   const char *cgi_path = csp->http->path;
   const char *method = csp->http->gpc;

   /* Note: "example.com" and "example.com." are equivalent hostnames. */
   const int is_site_1 = (0 == strcmpic(requested_host, CGI_SITE_1_HOST))
                      || (0 == strcmpic(requested_host, CGI_SITE_1_HOST "."));

   if (is_site_1 && (cgi_path[0] == '/'))
   {
      /* Everything below the root belongs to us; drop the leading '/'. */
      cgi_path++;
   }
   else
   {
      const size_t site_2_path_length = strlen(CGI_SITE_2_PATH);
      const int is_site_2 = (0 == strcmpic(requested_host, CGI_SITE_2_HOST))
                         || (0 == strcmpic(requested_host, CGI_SITE_2_HOST "."));

      if (!is_site_2
         || (0 != strncmpic(cgi_path, CGI_SITE_2_PATH, site_2_path_length)))
      {
         /* Not a CGI */
         return NULL;
      }

      /* Continue with whatever follows CGI_SITE_2_PATH. */
      cgi_path += site_2_path_length;

      switch (*cgi_path)
      {
         case '/':
            /* Skip the slash that separates the prefix from the CGI name. */
            cgi_path++;
            break;

         case '\0':
            /* The prefix was the whole path. */
            break;

         default:
            /*
             * The path merely starts with the same characters as
             * CGI_SITE_2_PATH (e.g. /configXXX). Do *NOT* intercept.
             */
            return NULL;
      }
   }

   if ((0 != strcmpic(method, "GET")) && (0 != strcmpic(method, "HEAD")))
   {
      log_error(LOG_LEVEL_ERROR,
         "CGI request with unsupported method received: %s", csp->http->gpc);

      /*
       * Only GET and HEAD are supported by the CGI pages. For any
       * other method, throw away whatever follows the headers in the
       * client buffer, so it is less likely to be misparsed as the
       * start of the next request.
       */
      csp->client_iob->eod = csp->client_iob->cur;
   }

   /* This is a CGI call. */
   return dispatch_known_cgi(csp, cgi_path);
}
/*********************************************************************
 *
 * Function    :  grep_cgi_referrer
 *
 * Description :  Stop-gap helper that searches the client header
 *                list (csp->headers) for the first line starting
 *                with "Referer: " (compared case-insensitively) and
 *                returns the text following that prefix.
 *
 *                FIXME: csp->headers ought to be csp->http->headers
 *                FIXME: Parsing all client header lines should
 *                       happen right after the request is received!
 *
 * Parameters  :
 *          1  :  csp = Current client state (buffers, headers, etc...)
 *
 * Returns     :  pointer into the stored header line (no copy!),
 *                or NULL if none found.
 *
 *********************************************************************/
static char *grep_cgi_referrer(const struct client_state *csp)
{
   static const char field_prefix[] = "Referer: ";
   const size_t field_prefix_length = sizeof(field_prefix) - 1;
   struct list_entry *entry = csp->headers->first;

   while (entry != NULL)
   {
      /* List entries without a string are skipped. */
      if ((entry->str != NULL)
         && (0 == strncmpic(entry->str, field_prefix, field_prefix_length)))
      {
         return entry->str + field_prefix_length;
      }
      entry = entry->next;
   }

   return NULL;
}
/*********************************************************************
 *
 * Function    :  referrer_is_safe
 *
 * Description :  Checks whether the request's Referer header points
 *                back into Privoxy's own web interface, i.e. starts
 *                with CGI_PREFIX or with "http://" CGI_SITE_1_HOST "/".
 *                Used to guard CGI pages that must only be reached
 *                from the web interface itself. The decision is
 *                logged either way.
 *
 * Parameters  :
 *          1  :  csp = Current client state (buffers, headers, etc...)
 *
 * Returns     :  TRUE if the referrer is safe, or
 *                FALSE if the referrer is unsafe or not set.
 *
 *********************************************************************/
static int referrer_is_safe(const struct client_state *csp)
{
   static const char alternative_prefix[] = "http://" CGI_SITE_1_HOST "/";
   char *referrer = grep_cgi_referrer(csp);

   if (referrer == NULL)
   {
      /* No referrer, no access */
      log_error(LOG_LEVEL_ERROR, "Denying access to %s. No referrer found.",
         csp->http->url);
      return FALSE;
   }

   if ((0 != strncmp(referrer, CGI_PREFIX, sizeof(CGI_PREFIX)-1))
      && (0 != strncmp(referrer, alternative_prefix, strlen(alternative_prefix))))
   {
      /* Untrustworthy referrer */
      log_error(LOG_LEVEL_ERROR, "Denying access to %s, referrer %s isn't trustworthy.",
         csp->http->url, referrer);
      return FALSE;
   }

   /* Trustworthy referrer */
   log_error(LOG_LEVEL_CGI, "Granting access to %s, referrer %s is trustworthy.",
      csp->http->url, referrer);
   return TRUE;
}
/*********************************************************************
 *
 * Function    :  dispatch_known_cgi
 *
 * Description :  Processes a CGI once dispatch_cgi has determined that
 *                it matches one of the magic prefixes. Splits the path
 *                into the CGI name and its arguments, builds a map of
 *                CGI parameter names to values, allocates the
 *                http_response and calls the matching handler from
 *                cgi_dispatchers.
 *
 *                The CGI name ends at the first '?' or '/'.
 *                After a '?', the rest is parsed as a query string.
 *                After a '/', the (URL-decoded) rest is passed to the
 *                handler as the single parameter "file".
 *
 *                Handlers that are not marked harmless are only run
 *                if referrer_is_safe() accepts the request. The one
 *                exception is "toggle", which is run with its "set"
 *                parameter removed so that it can only show the state.
 *
 * Parameters  :
 *          1  :  csp = Current client state (buffers, headers, etc...)
 *          2  :  path = Path of CGI, with the CGI prefix removed.
 *                       Should not have a leading "/".
 *
 * Returns     :  http_response, or the static response from
 *                cgi_error_memory() on handler failure or out of
 *                memory.
 *
 *********************************************************************/
static struct http_response *dispatch_known_cgi(struct client_state * csp,
                                                const char * path)
{
   const struct cgi_dispatcher *d;
   struct map *param_list;
   struct http_response *rsp;
   char *query_args_start;
   char *path_copy;
   jb_err err;

   /* Work on a private copy, as the name is NUL-terminated in place. */
   if (NULL == (path_copy = strdup(path)))
   {
      return cgi_error_memory();
   }

   query_args_start = path_copy;
   while (*query_args_start && *query_args_start != '?' && *query_args_start != '/')
   {
      query_args_start++;
   }

   if (*query_args_start == '/')
   {
      *query_args_start++ = '\0';

      /*
       * Do not continue without a parameter map: the handlers and
       * free_map() below are handed param_list unconditionally.
       */
      if (NULL == (param_list = new_map()))
      {
         free(path_copy);
         return cgi_error_memory();
      }

      /*
       * A failure here includes url_decode() returning NULL, which
       * map() is expected to report as an error (other callers in
       * this file rely on the same behaviour).
       */
      if (map(param_list, "file", 1, url_decode(query_args_start), 0))
      {
         free_map(param_list);
         free(path_copy);
         return cgi_error_memory();
      }
   }
   else
   {
      if (*query_args_start == '?')
      {
         *query_args_start++ = '\0';
      }
      if (NULL == (param_list = parse_cgi_parameters(query_args_start)))
      {
         free(path_copy);
         return cgi_error_memory();
      }
   }

   /*
    * At this point:
    * path_copy  = CGI call name
    * param_list = CGI params, as map
    */

   /* Get mem for response or fail */
   if (NULL == (rsp = alloc_http_response()))
   {
      free(path_copy);
      free_map(param_list);
      return cgi_error_memory();
   }

   /*
    * Find the right CGI function. The table ends with an entry whose
    * name is NULL; it doubles as the handler for unknown pages.
    */
   d = cgi_dispatchers;
   while ((d->name != NULL) && (strcmp(path_copy, d->name) != 0))
   {
      d++;
   }

   if (d->harmless || referrer_is_safe(csp))
   {
      /* Harmless, or referred from a trusted source: start it. */
      err = (d->handler)(csp, rsp, param_list);
   }
   else if (0 == strcmp(path_copy, "toggle"))
   {
      /* Untrusted toggle calls may only display the status. */
      unmap(param_list, "set");
      err = (d->handler)(csp, rsp, param_list);
   }
   else
   {
      /* Deny all other untrusted calls. */
      err = cgi_error_disabled(csp, rsp);
   }

   free(path_copy);
   free_map(param_list);

   if (err == JB_ERR_CGI_PARAMS)
   {
      err = cgi_error_bad_param(csp, rsp);
   }
   if (err && (err != JB_ERR_MEMORY))
   {
      /* Unexpected error! Shouldn't get here */
      log_error(LOG_LEVEL_ERROR,
         "Unexpected CGI error %d in top-level handler. "
         "Please file a bug report!", err);
      err = cgi_error_unknown(csp, rsp, err);
   }

   if (err)
   {
      /* Error in handler, probably out-of-memory */
      free_http_response(rsp);
      return cgi_error_memory();
   }

   /* It worked */
   rsp->crunch_reason = CGI_CALL;
   return finish_http_response(csp, rsp);
}
/*********************************************************************
 *
 * Function    :  parse_cgi_parameters
 *
 * Description :  Parse a URL-encoded argument string into name/value
 *                pairs and store them in a struct map list.
 *                Anything from the first '#' on is discarded.
 *                Fields without '=' or with an empty value are
 *                ignored.
 *
 * Parameters  :
 *          1  :  argstring = string to be parsed. Will be trashed.
 *
 * Returns     :  pointer to param list, or NULL if out of memory.
 *                The list is created even if argstring is empty,
 *                as other parts of Privoxy depend on its existence.
 *
 *********************************************************************/
static struct map *parse_cgi_parameters(char *argstring)
{
   char *p;
   char **vector;
   int pairs, i;
   struct map *cgi_params;

   /*
    * Upper bound for the number of fields ssplit() can find.
    *
    * As empty fields are ignored, n fields need at least n
    * characters plus n-1 separators, so a string of length len
    * holds at most (len + 1) / 2 fields. Rounding len / 2 down
    * instead would be one short for odd lengths such as "a&b".
    * len / 2 + 1 is never smaller than that bound and is also
    * non-zero for an empty string.
    *
    * XXX: Still a bit wasteful. The same hack is used in
    * get_last_url() so it looks like a real solution is needed.
    */
   const size_t max_segments = strlen(argstring) / 2 + 1;

   vector = malloc_or_die(max_segments * sizeof(char *));

   if (NULL == (cgi_params = new_map()))
   {
      freez(vector);
      return NULL;
   }

   /*
    * IE 5 does, of course, violate RFC 2316 Sect 4.1 and sends
    * the fragment identifier along with the request, so we must
    * cut it off here, so it won't pollute the CGI params:
    */
   if (NULL != (p = strchr(argstring, '#')))
   {
      *p = '\0';
   }

   pairs = ssplit(argstring, "&", vector, max_segments);
   assert(pairs != -1);
   if (pairs == -1)
   {
      freez(vector);
      free_map(cgi_params);
      return NULL;
   }

   for (i = 0; i < pairs; i++)
   {
      /* Only "name=value" fields with a non-empty value are stored. */
      if ((NULL != (p = strchr(vector[i], '='))) && (*(p+1) != '\0'))
      {
         /* Split the field in place into name and value. */
         *p = '\0';
         if (map(cgi_params, url_decode(vector[i]), 0, url_decode(++p), 0))
         {
            freez(vector);
            free_map(cgi_params);
            return NULL;
         }
      }
   }

   freez(vector);

   return cgi_params;
}
/*********************************************************************
 *
 * Function    :  get_char_param
 *
 * Description :  Fetch the first character of a CGI parameter's
 *                value. Lower-case ASCII letters are converted
 *                to upper case; everything else is returned as is.
 *
 * Parameters  :
 *          1  :  parameters = map of cgi parameters
 *          2  :  param_name = The name of the parameter to read
 *
 * Returns     :  Uppercase character on success, '\0' on error.
 *
 *********************************************************************/
char get_char_param(const struct map *parameters,
                    const char *param_name)
{
   const char *value;
   char first;

   assert(parameters);
   assert(param_name);

   value = lookup(parameters, param_name);
   first = value[0];

   if ((first < 'a') || (first > 'z'))
   {
      /* Not a lower-case letter, nothing to convert. */
      return first;
   }

   return (char)(first - 'a' + 'A');
}
/*********************************************************************
 *
 * Function    :  get_string_param
 *
 * Description :  Get a string parameter, to be used as an
 *                ACTION_STRING or ACTION_MULTI parameter.
 *                Validates the input to prevent stupid/malicious
 *                users from corrupting their action file: values of
 *                CGI_PARAM_LEN_MAX or more characters, and values
 *                containing control characters (below ' ') or '}',
 *                are rejected.
 *
 * Parameters  :
 *          1  :  parameters = map of cgi parameters
 *          2  :  param_name = The name of the parameter to read
 *          3  :  pparam = destination for parameter. Points into
 *                the map "parameters", so don't free it.
 *                Set to NULL if not specified or not valid.
 *
 * Returns     :  JB_ERR_OK on success, or if the parameter
 *                was not specified.
 *                JB_ERR_CGI_PARAMS if the parameter is not valid.
 *
 *********************************************************************/
jb_err get_string_param(const struct map *parameters,
                        const char *param_name,
                        const char **pparam)
{
   const char *value;
   const char *cursor;

   assert(parameters);
   assert(param_name);
   assert(pparam);

   *pparam = NULL;

   value = lookup(parameters, param_name);
   if (*value == '\0')
   {
      /* Not specified, which is fine. */
      return JB_ERR_OK;
   }

   /*
    * The length limit is arbitrary, it just seems sensible to
    * limit it to *something*. There's no technical reason for
    * any limit at all.
    */
   if (strlen(value) >= CGI_PARAM_LEN_MAX)
   {
      return JB_ERR_CGI_PARAMS;
   }

   /* Check every character to see if it's legal */
   for (cursor = value; *cursor != '\0'; cursor++)
   {
      const int is_control = ((unsigned char)*cursor < (unsigned char)' ');

      if (is_control || (*cursor == '}'))
      {
         /* Probable hack attempt, or user accidentally used '}'. */
         return JB_ERR_CGI_PARAMS;
      }
   }

   /* Success */
   *pparam = value;

   return JB_ERR_OK;
}
/*********************************************************************
 *
 * Function    :  get_number_param
 *
 * Description :  Get a non-negative decimal integer from the
 *                parameters passed to a CGI function. Only the
 *                digits '0'-'9' are accepted, and the value has
 *                to fit into an unsigned int.
 *
 * Parameters  :
 *          1  :  csp = Current client state (buffers, headers, etc...)
 *          2  :  parameters = map of cgi parameters
 *          3  :  name = Name of CGI parameter to read
 *          4  :  pvalue = destination for value.
 *                         Set to 0 on error.
 *
 * Returns     :  JB_ERR_OK on success
 *                JB_ERR_CGI_PARAMS if the parameter was not specified
 *                or is not valid.
 *
 *********************************************************************/
jb_err get_number_param(struct client_state *csp,
                        const struct map *parameters,
                        char *name,
                        unsigned *pvalue)
{
   const char *cursor;
   unsigned accumulated = 0;

   assert(csp);
   assert(parameters);
   assert(name);
   assert(pvalue);

   *pvalue = 0;

   cursor = lookup(parameters, name);
   if (*cursor == '\0')
   {
      return JB_ERR_CGI_PARAMS;
   }

   /* We don't use atoi because I want to check this carefully... */
   for (; *cursor != '\0'; cursor++)
   {
      unsigned digit;

      if ((*cursor < '0') || (*cursor > '9'))
      {
         return JB_ERR_CGI_PARAMS;
      }
      digit = (unsigned)(*cursor - '0');

      /*
       * (UINT_MAX - digit) / 10 is the largest number that can be
       * safely multiplied by 10 then have digit added.
       * UINT_MAX comes from <limits.h>.
       */
      if (accumulated > ((UINT_MAX - digit) / 10U))
      {
         return JB_ERR_CGI_PARAMS;
      }

      accumulated = accumulated * 10 + digit;
   }

   /* Success */
   *pvalue = accumulated;

   return JB_ERR_OK;
}
/*********************************************************************
 *
 * Function    :  error_response
 *
 * Description :  returns an http_response that explains the reason
 *                why a request failed. The template name selects
 *                both the page and the HTTP status line; known names
 *                are "no-such-domain", "forwarding-failed",
 *                "connect-failed", "connection-timeout" and
 *                "no-server-data". Any other name leaves the status
 *                unset and is answered with cgi_error_memory().
 *
 * Parameters  :
 *          1  :  csp = Current client state (buffers, headers, etc...)
 *          2  :  templatename = Which template should be used for the answer
 *
 * Returns     :  A http_response. If we run out of memory, this
 *                will be cgi_error_memory().
 *
 *********************************************************************/
struct http_response *error_response(struct client_state *csp,
                                     const char *templatename)
{
   jb_err err;
   struct http_response *rsp;
   struct map *exports = default_exports(csp, NULL);
   char *path = NULL;

   if (exports == NULL)
   {
      return cgi_error_memory();
   }

   if (NULL == (rsp = alloc_http_response()))
   {
      free_map(exports);
      return cgi_error_memory();
   }

   /* Rebuild the requested path, including the force prefix if used. */
#ifdef FEATURE_FORCE_LOAD
   if (csp->flags & CSP_FLAG_FORCED)
   {
      path = strdup(FORCE_PREFIX);
   }
   else
#endif /* def FEATURE_FORCE_LOAD */
   {
      path = strdup("");
   }
   err = string_append(&path, csp->http->path);

   if (!err) err = map(exports, "host", 1, html_encode(csp->http->host), 0);
   if (!err) err = map(exports, "hostport", 1, html_encode(csp->http->hostport), 0);
   if (!err)
   {
      err = map(exports, "path", 1, html_encode_and_free_original(path), 0);
      /*
       * path has been handed to html_encode_and_free_original(),
       * which (going by its name) releases it; forget the pointer
       * so the error path below does not free it again.
       */
      path = NULL;
   }
   if (!err) err = map(exports, "protocol", 1, csp->http->ssl ? "https://" : "http://", 1);
   if (!err)
   {
      err = map(exports, "host-ip", 1, html_encode(csp->http->host_ip_addr_str), 0);
      if (err)
      {
         /* Some failures, like "404 no such domain", don't have an IP address. */
         err = map(exports, "host-ip", 1, html_encode(csp->http->host), 0);
      }
   }

   if (err)
   {
      /*
       * If "host" or "hostport" could not be mapped, path was never
       * handed off and is still owned here.
       * NOTE(review): assumes string_append() leaves path either
       * valid or NULL when it fails - confirm against miscutil.c.
       */
      free(path);
      free_map(exports);
      free_http_response(rsp);
      return cgi_error_memory();
   }

   if (!strcmp(templatename, "no-such-domain"))
   {
      rsp->status = strdup("404 No such domain");
      rsp->crunch_reason = NO_SUCH_DOMAIN;
   }
   else if (!strcmp(templatename, "forwarding-failed"))
   {
      const struct forward_spec *fwd = forward_url(csp, csp->http);
      const char *socks_type = NULL;

      if (fwd == NULL)
      {
         log_error(LOG_LEVEL_FATAL, "gateway spec is NULL. This shouldn't happen!");
         /* Never get here - LOG_LEVEL_FATAL causes program exit */
      }

      /*
       * XXX: While the template is called forwarding-failed,
       * it currently only handles socks forwarding failures.
       */
      assert(fwd != NULL);
      assert(fwd->type != SOCKS_NONE);

      /*
       * Map failure reason, forwarding type and forwarder.
       */
      if (NULL == csp->error_message)
      {
         /*
          * Either we forgot to record the failure reason,
          * or the memory allocation failed.
          */
         log_error(LOG_LEVEL_ERROR, "Socks failure reason missing.");
         csp->error_message = strdup("Failure reason missing. Check the log file for details.");
      }
      err = map(exports, "gateway", 1, fwd->gateway_host, 1);

      /*
       * XXX: this is almost the same code as in cgi_show_url_info()
       * and thus should be factored out and shared.
       */
      switch (fwd->type)
      {
         case SOCKS_4:
            socks_type = "socks4-";
            break;
         case SOCKS_4A:
            socks_type = "socks4a-";
            break;
         case SOCKS_5:
            socks_type = "socks5-";
            break;
         case SOCKS_5T:
            socks_type = "socks5t-";
            break;
         default:
            log_error(LOG_LEVEL_FATAL, "Unknown socks type: %d.", fwd->type);
      }

      if (!err) err = map(exports, "forwarding-type", 1, socks_type, 1);
      if (!err) err = map(exports, "error-message", 1, html_encode(csp->error_message), 0);
      if ((NULL == csp->error_message) || err)
      {
         free_map(exports);
         free_http_response(rsp);
         return cgi_error_memory();
      }

      rsp->status = strdup("503 Forwarding failure");
      rsp->crunch_reason = FORWARDING_FAILED;
   }
   else if (!strcmp(templatename, "connect-failed"))
   {
      rsp->status = strdup("503 Connect failed");
      rsp->crunch_reason = CONNECT_FAILED;
   }
   else if (!strcmp(templatename, "connection-timeout"))
   {
      rsp->status = strdup("504 Connection timeout");
      rsp->crunch_reason = CONNECTION_TIMEOUT;
   }
   else if (!strcmp(templatename, "no-server-data"))
   {
      rsp->status = strdup("502 No data received from server or forwarder");
      rsp->crunch_reason = NO_SERVER_DATA;
   }

   /* Either strdup() failed or the template name was not recognized. */
   if (rsp->status == NULL)
   {
      free_map(exports);
      free_http_response(rsp);
      return cgi_error_memory();
   }

   /*
    * exports is not released here on failure.
    * NOTE(review): presumably template_fill_for_cgi() takes ownership
    * of the map in every case - confirm.
    */
   err = template_fill_for_cgi(csp, templatename, exports, rsp);
   if (err)
   {
      free_http_response(rsp);
      return cgi_error_memory();
   }

   return finish_http_response(csp, rsp);
}
/*********************************************************************
*
* Function : cgi_error_disabled
*
* Description : CGI function that is called to generate an error
* response if the actions editor or toggle CGI are
* accessed despite having being disabled at compile-
* or run-time, or if the user followed an untrusted link
* to access a unsafe CGI feature that is only reachable
* through Privoxy directly.
*
* Parameters :
* 1 : csp = Current client state (buffers, headers, etc...)
* 2 : rsp = http_response data structure for output
*
* CGI Parameters : none
*
* Returns : JB_ERR_OK on success
* JB_ERR_MEMORY on out-of-memory error.
*
*********************************************************************/
jb_err cgi_error_disabled(const struct client_state *csp,
struct http_response *rsp)
{
struct map *exports;
assert(csp);
assert(rsp);
if (NULL == (exports = default_exports(csp, "cgi-error-disabled")))
{
return JB_ERR_MEMORY;
}
if (map(exports, "url", 1, html_encode(csp->http->url), 0))
{
/* Not important enough to do anything */
log_error(LOG_LEVEL_ERROR, "Failed to fill in url.");
}
return template_fill_for_cgi(csp, "cgi-error-disabled", exports, rsp);
}
/*********************************************************************
 *
 * Function    :  cgi_init_error_messages
 *
 * Description :  Call at the start of the program to initialize
 *                the static error response returned by
 *                cgi_error_memory(). The head and body are string
 *                literals, so no memory is allocated here.
 *
 * Parameters  :  N/A
 *
 * Returns     :  N/A
 *
 *********************************************************************/
void cgi_init_error_messages(void)
{
/* Start from an all-zero response, then fill in only the fields needed. */
memset(cgi_error_memory_response, '\0', sizeof(*cgi_error_memory_response));
cgi_error_memory_response->head =
"HTTP/1.0 500 Internal Privoxy Error\r\n"
"Content-Type: text/html\r\n"
"\r\n";
/*
 * NOTE(review): the body literal below contains raw line breaks and
 * looks like it has lost its markup - confirm against the pristine file.
 */
cgi_error_memory_response->body =
"\n"
"\n"
" 500 Internal Privoxy Error\n"
" "
"\n"
"\n"
"
500 Internal Privoxy Error
\n"
"
Privoxy ran out of memory while processing your request.
\n"
"
Please contact your proxy administrator, or try again later
\n"
"\n"
"\n";
/* Lengths are taken from the literals assigned above. */
cgi_error_memory_response->head_length =
strlen(cgi_error_memory_response->head);
cgi_error_memory_response->content_length =
strlen(cgi_error_memory_response->body);
cgi_error_memory_response->crunch_reason = OUT_OF_MEMORY;
}
/*********************************************************************
*
* Function : cgi_error_memory
*
* Description : Called if a CGI function runs out of memory.
* Returns a statically-allocated error response.
*
* Parameters : N/A
*
* Returns : http_response data structure for output. This is
* statically allocated, for obvious reasons.
*
*********************************************************************/
struct http_response *cgi_error_memory(void)
{
/* assert that it's been initialized. */
assert(cgi_error_memory_response->head);
return cgi_error_memory_response;
}
/*********************************************************************
 *
 * Function    :  cgi_error_no_template
 *
 * Description :  Almost-CGI function that is called if a template
 *                cannot be loaded. Note this is not a true CGI,
 *                it takes a template name rather than a map of
 *                parameters. Any status, head and body already
 *                stored in rsp are released and replaced by a
 *                "500 Internal Privoxy Error" page that names the
 *                missing template.
 *
 * Parameters  :
 *          1  :  csp = Current client state (buffers, headers, etc...)
 *          2  :  rsp = http_response data structure for output
 *          3  :  template_name = Name of template that could not
 *                                be loaded.
 *
 * Returns     :  JB_ERR_OK on success
 *                JB_ERR_MEMORY on out-of-memory error.
 *
 *********************************************************************/
jb_err cgi_error_no_template(const struct client_state *csp,
struct http_response *rsp,
const char *template_name)
{
/*
 * NOTE(review): several literals below contain raw line breaks and
 * look like they have lost their markup - confirm against the pristine file.
 */
static const char status[] =
"500 Internal Privoxy Error";
static const char body_prefix[] =
"\n"
"\n"
" 500 Internal Privoxy Error\n"
" "
"\n"
"\n"
"
500 Internal Privoxy Error
\n"
"
Privoxy encountered an error while processing your request:
\n"
"
Could not load template file ";
static const char body_suffix[] =
" or one of its included components.
\n"
"
Please contact your proxy administrator.
\n"
"
If you are the proxy administrator, please put the required file(s)"
"in the (confdir)/templates directory. The "
"location of the (confdir) directory "
"is specified in the main Privoxy config "
"file. (It's typically the Privoxy install directory"
#ifndef _WIN32
", or /etc/privoxy/"
#endif /* ndef _WIN32 */
").
\n"
"\n"
"\n";
/*
 * Room for prefix + template name + suffix + terminating NUL.
 * NOTE(review): strlen(template_name) is evaluated here, before the
 * assert(template_name) below gets a chance to catch a NULL pointer.
 */
const size_t body_size = strlen(body_prefix) + strlen(template_name) + strlen(body_suffix) + 1;
assert(csp);
assert(rsp);
assert(template_name);
/* Reset rsp, if needed */
freez(rsp->status);
freez(rsp->head);
freez(rsp->body);
rsp->content_length = 0;
rsp->head_length = 0;
rsp->is_static = 0;
/* Assemble the body; every copy is bounded by body_size. */
rsp->body = malloc_or_die(body_size);
strlcpy(rsp->body, body_prefix, body_size);
strlcat(rsp->body, template_name, body_size);
strlcat(rsp->body, body_suffix, body_size);
rsp->status = strdup(status);
if (rsp->status == NULL)
{
return JB_ERR_MEMORY;
}
return JB_ERR_OK;
}
/*********************************************************************
 *
 * Function    :  cgi_error_unknown
 *
 * Description :  Almost-CGI function that is called if an unexpected
 *                error occurs in the top-level CGI dispatcher.
 *                In this context, "unexpected" means "anything other
 *                than JB_ERR_MEMORY or JB_ERR_CGI_PARAMS" - CGIs are
 *                expected to handle all other errors internally,
 *                since they can give more relevant error messages
 *                that way.
 *
 *                Note this is not a true CGI, it takes an error
 *                code rather than a map of parameters.
 *
 * Parameters  :
 *          1  :  csp = Current client state (buffers, headers, etc...)
 *          2  :  rsp = http_response data structure for output
 *          3  :  error_to_report = Error code to report.
 *
 * Returns     :  JB_ERR_OK on success
 *                JB_ERR_MEMORY on out-of-memory error.
 *
 *********************************************************************/
jb_err cgi_error_unknown(const struct client_state *csp,
struct http_response *rsp,
jb_err error_to_report)
{
/*
 * NOTE(review): the literal below contains raw line breaks, and no
 * declaration of body_suffix (used further down) is visible here -
 * text seems to be missing; confirm against the pristine file.
 */
static const char status[] =
"500 Internal Privoxy Error";
static const char body_prefix[] =
"\n"
"\n"
" 500 Internal Privoxy Error\n"
" "
"\n"
"\n"
"
500 Internal Privoxy Error
\n"
"
Privoxy encountered an error while processing your request:
\n"
"\n"
"\n";
/* Includes room for larger error numbers in the future. */
const size_t body_size = sizeof(body_prefix) + sizeof(body_suffix) + 5;
assert(csp);
assert(rsp);
/* Reset rsp, if needed */
freez(rsp->status);
freez(rsp->head);
freez(rsp->body);
rsp->content_length = 0;
rsp->head_length = 0;
rsp->is_static = 0;
rsp->crunch_reason = INTERNAL_ERROR;
/* Body is prefix + decimal error code + suffix, bounded by body_size. */
rsp->body = malloc_or_die(body_size);
snprintf(rsp->body, body_size, "%s%d%s", body_prefix, error_to_report, body_suffix);
rsp->status = strdup(status);
if (rsp->status == NULL)
{
return JB_ERR_MEMORY;
}
return JB_ERR_OK;
}
/*********************************************************************
*
* Function : cgi_error_bad_param
*
* Description : CGI function that is called if the parameters
* (query string) for a CGI were wrong.
*
* Parameters :
* 1 : csp = Current client state (buffers, headers, etc...)
* 2 : rsp = http_response data structure for output
*
* CGI Parameters : none
*
* Returns : JB_ERR_OK on success
* JB_ERR_MEMORY on out-of-memory error.
*
*********************************************************************/
jb_err cgi_error_bad_param(const struct client_state *csp,
                           struct http_response *rsp)
{
   struct map *exports;

   assert(csp);
   assert(rsp);

   /* The error page only needs the exports common to all CGI pages. */
   exports = default_exports(csp, NULL);
   if (exports == NULL)
   {
      return JB_ERR_MEMORY;
   }

   /* The exports map is handed over to template_fill_for_cgi(). */
   return template_fill_for_cgi(csp, "cgi-error-bad-param", exports, rsp);
}
/*********************************************************************
*
* Function : cgi_redirect
*
* Description : CGI support function to generate a HTTP redirect
* message
*
* Parameters :
* 1 : rsp = http_response data structure for output
* 2 : target = string with the target URL
*
* CGI Parameters : None
*
* Returns : JB_ERR_OK on success
* JB_ERR_MEMORY on out-of-memory error.
*
*********************************************************************/
jb_err cgi_redirect (struct http_response * rsp, const char *target)
{
   jb_err err;

   assert(rsp);
   assert(target);

   /* Point the client at the new location ... */
   err = enlist_unique_header(rsp->headers, "Location", target);

   /* ... and set the matching status text. */
   rsp->status = strdup("302 Local Redirect from Privoxy");

   /* A failed strdup() takes precedence over the header result. */
   return (rsp->status != NULL) ? err : JB_ERR_MEMORY;
}
/*********************************************************************
*
* Function : add_help_link
*
* Description : Produce a copy of the string given as item,
* embedded in an HTML link to its corresponding
* section (item name in uppercase) in the actions
* chapter of the user manual, (whose URL is given in
* the config and defaults to our web site).
*
* FIXME: I currently only work for actions, and would
* like to be generalized for other topics.
*
* Parameters :
* 1 : item = item (will NOT be free()d.)
* It is assumed to be HTML-safe.
* 2 : config = The current configuration.
*
* Returns : String with item embedded in link, or NULL on
* out-of-memory
*
*********************************************************************/
char *add_help_link(const char *item,
struct configuration_spec *config)
{
char *result;
if (!item) return NULL;
result = strdup("usermanual, "file://", 7) ||
!strncmpic(config->usermanual, "http", 4))
{
string_append(&result, config->usermanual);
}
else
{
string_append(&result, "http://");
string_append(&result, CGI_SITE_2_HOST);
string_append(&result, "/user-manual/");
}
string_append(&result, ACTIONS_HELP_PREFIX);
string_join (&result, string_toupper(item));
string_append(&result, "\">");
string_append(&result, item);
string_append(&result, "");
return result;
}
/*********************************************************************
*
* Function : get_http_time
*
* Description : Get the time in a format suitable for use in a
* HTTP header - e.g.:
* "Sun, 06 Nov 1994 08:49:37 GMT"
*
* Parameters :
* 1 : time_offset = Time returned will be current time
* plus this number of seconds.
* 2 : buf = Destination for result.
* 3 : buffer_size = Size of the buffer above. Must be big
* enough to hold 29 characters plus a
* trailing zero.
*
* Returns : N/A
*
*********************************************************************/
void get_http_time(int time_offset, char *buf, size_t buffer_size)
{
   struct tm *t;
   time_t current_time;
#if defined(HAVE_GMTIME_R)
   struct tm dummy;
#endif

   assert(buf);
   assert(buffer_size > (size_t)29);

   time(&current_time);

   /* Shift the reported time by the requested number of seconds. */
   current_time += time_offset;

   /* get and save the gmt */
#if HAVE_GMTIME_R
   t = gmtime_r(&current_time, &dummy);
#elif defined(MUTEX_LOCKS_AVAILABLE)
   privoxy_mutex_lock(&gmtime_mutex);
   t = gmtime(&current_time);
   privoxy_mutex_unlock(&gmtime_mutex);
#else
   t = gmtime(&current_time);
#endif

   /*
    * gmtime() may fail and strftime() returns 0 if the result does not
    * fit, in which case the buffer content is indeterminate. Hand back
    * an empty string instead of passing garbage on to the caller.
    */
   if ((t == NULL)
      || (0 == strftime(buf, buffer_size, "%a, %d %b %Y %H:%M:%S GMT", t)))
   {
      buf[0] = '\0';
   }
}
/*********************************************************************
*
* Function : get_locale_time
*
* Description : Get the time in a date(1)-like format
* according to the current locale - e.g.:
* "Fri Aug 29 19:37:12 CEST 2008"
*
* XXX: Should we allow the user to change the format?
*
* Parameters :
* 1 : buf = Destination for result.
* 2 : buffer_size = Size of the buffer above. Must be big
* enough to hold 29 characters plus a
* trailing zero.
*
* Returns : N/A
*
*********************************************************************/
static void get_locale_time(char *buf, size_t buffer_size)
{
   struct tm *timeptr;
   time_t current_time;
#if defined(HAVE_LOCALTIME_R)
   struct tm dummy;
#endif

   assert(buf);
   assert(buffer_size > (size_t)29);

   time(&current_time);

#if HAVE_LOCALTIME_R
   timeptr = localtime_r(&current_time, &dummy);
#elif defined(MUTEX_LOCKS_AVAILABLE)
   privoxy_mutex_lock(&localtime_mutex);
   timeptr = localtime(&current_time);
   privoxy_mutex_unlock(&localtime_mutex);
#else
   timeptr = localtime(&current_time);
#endif

   /*
    * The length of the %X and %Z expansions depends on locale and time
    * zone, so the result may not fit into the caller's buffer. In that
    * case strftime() returns 0 and leaves the buffer indeterminate;
    * return an empty string rather than unterminated garbage. The same
    * applies if localtime() failed.
    */
   if ((timeptr == NULL)
      || (0 == strftime(buf, buffer_size, "%a %b %d %X %Z %Y", timeptr)))
   {
      buf[0] = '\0';
   }
}
#ifdef FEATURE_COMPRESSION
/*********************************************************************
*
* Function : compress_buffer
*
* Description : Compresses the content of a buffer with zlib's deflate
* Allocates a new buffer for the result, free'ing it is
* up to the caller.
*
* Parameters :
* 1 : buffer = buffer whose content should be compressed
* 2 : buffer_length = IN: length of the buffer.
* OUT: length of the compressed content
* (left untouched on error).
* 3 : compression_level = compression level for compress2()
*
* Returns : NULL on error, otherwise a pointer to the compressed
* content of the input buffer.
*
*********************************************************************/
char *compress_buffer(char *buffer, size_t *buffer_length, int compression_level)
{
   char *compressed_buffer;
   uLongf new_length;

   assert(-1 <= compression_level && compression_level <= 9);

   /* Let zlib figure out the maximum length of the compressed data */
   new_length = compressBound((uLongf)*buffer_length);

   compressed_buffer = malloc_or_die(new_length);

   /* On success compress2() stores the actual compressed size in new_length. */
   if (Z_OK != compress2((Bytef *)compressed_buffer, &new_length,
         (Bytef *)buffer, (uLong)*buffer_length, compression_level))
   {
      /*
       * The lengths are cast explicitly: passing uLongf or size_t
       * through varargs for a %d conversion is undefined behaviour
       * on platforms where they are wider than int.
       */
      log_error(LOG_LEVEL_ERROR,
         "compress2() failed. Buffer size: %d, compression level: %d.",
         (int)new_length, compression_level);
      freez(compressed_buffer);
      return NULL;
   }

   log_error(LOG_LEVEL_RE_FILTER,
      "Compressed content from %d to %d bytes. Compression level: %d",
      (int)*buffer_length, (int)new_length, compression_level);

   /* Report the compressed size back; the caller owns the new buffer. */
   *buffer_length = (size_t)new_length;

   return compressed_buffer;
}
#endif
/*********************************************************************
*
* Function : finish_http_response
*
* Description : Fill in the missing headers in an http response,
* and flatten the headers to an http head.
* For HEAD requests the body is freed once
* the Content-Length header is set.
*
* Parameters :
* 1 : csp = Current client state (buffers, headers, etc...)
* 2 : rsp = pointer to http_response to be processed
*
* Returns : A http_response, usually the rsp parameter.
* On error, free()s rsp and returns cgi_error_memory()
*
*********************************************************************/
struct http_response *finish_http_response(struct client_state *csp, struct http_response *rsp)
{
   /* Status codes whose pages get the fixed, very old Last-Modified date. */
   static const char * const old_date_status_codes[] =
   {
      "403", "404", "502", "503", "504"
   };
   char buf[BUFFER_SIZE];
   const char *http_version;
   const char *status_text;
   int is_redirect;
   jb_err err;

   /*
    * The statically allocated out-of-memory response is
    * ready for output as it is and must not be touched.
    */
   if (rsp == cgi_error_memory_response)
   {
      return rsp;
   }

   /*
    * Status line first: HTTP/1.1, unless the client
    * explicitly asked for HTTP/1.0.
    */
   http_version = strcmpic(csp->http->ver, "HTTP/1.0") ? "HTTP/1.1" : "HTTP/1.0";
   status_text = rsp->status ? rsp->status : "200 OK";
   snprintf(buf, sizeof(buf), "%s %s", http_version, status_text);
   err = enlist_first(rsp->headers, buf);

   /*
    * Derive the content length from the body
    * unless it has been set explicitly.
    */
   if ((rsp->content_length == 0) && (rsp->body != NULL))
   {
      rsp->content_length = strlen(rsp->body);
   }

#ifdef FEATURE_COMPRESSION
   if (!err && (csp->flags & CSP_FLAG_CLIENT_SUPPORTS_DEFLATE)
      && (rsp->content_length > LOWER_LENGTH_LIMIT_FOR_COMPRESSION))
   {
      /* compress_buffer() updates rsp->content_length on success. */
      char *deflated_body = compress_buffer(rsp->body, &rsp->content_length,
         csp->config->compression_level);

      if (deflated_body != NULL)
      {
         freez(rsp->body);
         rsp->body = deflated_body;
         err = enlist_unique_header(rsp->headers, "Content-Encoding", "deflate");
      }
   }
#endif

   if (!err)
   {
      snprintf(buf, sizeof(buf), "Content-Length: %d", (int)rsp->content_length);
      /*
       * Let serve() know that the client can detect the end of
       * the response without the connection being closed.
       */
      csp->flags |= CSP_FLAG_SERVER_CONTENT_LENGTH_SET;
      err = enlist(rsp->headers, buf);
   }

   if (!strcmpic(csp->http->gpc, "head"))
   {
      /*
       * HEAD request: the body is only needed to get the
       * Content-Length right for dynamic pages, which has
       * happened above, so it can be thrown away now.
       */
      log_error(LOG_LEVEL_CGI, "Preparing to give head to %s.", csp->ip_addr_str);
      freez(rsp->body);
      rsp->content_length = 0;
   }

   is_redirect = !strncmpic(rsp->status, "302", 3);

   /*
    * Anything but a (body-less) redirect defaults to
    * text/html unless a Content-Type is already present.
    */
   if (!is_redirect && !err)
   {
      err = enlist_unique(rsp->headers, "Content-Type: text/html", 13);
   }

   /*
    * Remaining default headers (Date, Last-Modified, Expires,
    * Cache-Control), see http://www.w3.org/Protocols/rfc2068/rfc2068
    */
   if (rsp->is_static)
   {
      /*
       * Static content expires in about ten minutes so that it gets
       * reloaded every now and then, e.g. after a Privoxy upgrade.
       */
      if (!err)
      {
         get_http_time(0, buf, sizeof(buf));
         err = enlist_unique_header(rsp->headers, "Date", buf);
      }

      /* Some date in the past. */
      if (!err)
      {
         err = enlist_unique_header(rsp->headers, "Last-Modified", "Sat, 17 Jun 2000 12:00:00 GMT");
      }

      if (!err)
      {
         get_http_time(10 * 60, buf, sizeof(buf)); /* 10 * 60sec = 10 minutes */
         err = enlist_unique_header(rsp->headers, "Expires", buf);
      }
   }
   else if (is_redirect)
   {
      get_http_time(0, buf, sizeof(buf));
      if (!err)
      {
         err = enlist_unique_header(rsp->headers, "Date", buf);
      }
   }
   else
   {
      /*
       * "Cache-Control: no-cache" plus an Expires date in the past
       * does not forbid caching, it merely forces the client to
       * revalidate its copy.
       *
       * When the user retries after a temporary problem, the browser
       * may send If-Modified-Since with the Last-Modified value of
       * Privoxy's error page. The real document is usually older than
       * that, so the server would answer 304 and the stale error page
       * would be shown over and over.
       *
       * For the status codes listed in old_date_status_codes the
       * Last-Modified header is therefore set to Tim Berners-Lee's
       * birthday, which predates every page on the web and can be
       * "revalidated" without ever producing a 304.
       *
       * The pointless If-Modified-Since header is stripped by
       * client_if_modified_since in parsers.c before it reaches
       * the server.
       */
      const char *last_modified;
      size_t i;

      if (!err)
      {
         err = enlist_unique_header(rsp->headers, "Cache-Control", "no-cache");
      }

      get_http_time(0, buf, sizeof(buf));
      if (!err)
      {
         err = enlist_unique_header(rsp->headers, "Date", buf);
      }

      /* Current time by default, the ancient date for the listed codes. */
      last_modified = buf;
      for (i = 0; i < sizeof(old_date_status_codes) / sizeof(old_date_status_codes[0]); i++)
      {
         if (!strncmpic(rsp->status, old_date_status_codes[i], 3))
         {
            last_modified = "Wed, 08 Jun 1955 12:00:00 GMT";
            break;
         }
      }
      if (!err)
      {
         err = enlist_unique_header(rsp->headers, "Last-Modified", last_modified);
      }

      if (!err)
      {
         err = enlist_unique_header(rsp->headers, "Expires", "Sat, 17 Jun 2000 12:00:00 GMT");
      }
      if (!err)
      {
         err = enlist_unique_header(rsp->headers, "Pragma", "no-cache");
      }
   }

   if (!err && (!(csp->flags & CSP_FLAG_CLIENT_CONNECTION_KEEP_ALIVE)
      || (csp->flags & CSP_FLAG_SERVER_SOCKET_TAINTED)))
   {
      err = enlist_unique_header(rsp->headers, "Connection", "close");
   }

   /*
    * Flatten the header list into the head. This
    * is skipped if an earlier step already failed.
    */
   if (!err)
   {
      rsp->head = list_to_text(rsp->headers);
   }
   if (err || (rsp->head == NULL))
   {
      free_http_response(rsp);
      return cgi_error_memory();
   }

   rsp->head_length = strlen(rsp->head);

   return rsp;
}
/*********************************************************************
*
* Function : alloc_http_response
*
* Description : Allocates a new http_response structure.
*
* Parameters : N/A
*
* Returns : pointer to a new http_response, or NULL.
*
*********************************************************************/
struct http_response *alloc_http_response(void)
{
   /* zalloc() result is passed through unchanged, including NULL. */
   struct http_response *rsp = (struct http_response *) zalloc(sizeof(struct http_response));

   return rsp;
}
/*********************************************************************
*
* Function : free_http_response
*
* Description : Free the memory occupied by an http_response
* and its dependent structures.
*
* Parameters :
* 1 : rsp = pointer to http_response to be freed
*
* Returns : N/A
*
*********************************************************************/
void free_http_response(struct http_response *rsp)
{
   /*
    * Nothing to do for NULL, and the special
    * cgi_error_memory_response is never freed.
    */
   if ((rsp == NULL) || (rsp == cgi_error_memory_response))
   {
      return;
   }

   /* Release the members first, then the structure itself. */
   freez(rsp->status);
   freez(rsp->head);
   freez(rsp->body);
   destroy_list(rsp->headers);
   free(rsp);
}
/*********************************************************************
*
* Function : template_load
*
* Description : CGI support function that loads a given HTML
* template, ignoring comment lines and following
* #include statements up to a depth of 1.
*
* Parameters :
* 1 : csp = Current client state (buffers, headers, etc...)
* 2 : template_ptr = Destination for pointer to loaded
* template text.
* 3 : templatename = name of the HTML template to be used
* 4 : recursive = Flag set if this function calls itself
* following an #include statement
*
* Returns : JB_ERR_OK on success
* JB_ERR_MEMORY on out-of-memory error.
* JB_ERR_FILE if the template file cannot be read
*
*********************************************************************/
jb_err template_load(const struct client_state *csp, char **template_ptr,
                     const char *templatename, int recursive)
{
   jb_err err;
   char *templates_dir_path;
   char *full_path;
   char *file_buffer;
   char *included_module;
   const char *p;
   FILE *fp;
   char buf[BUFFER_SIZE];

   assert(csp);
   assert(template_ptr);
   assert(templatename);

   *template_ptr = NULL;

   /*
    * Paranoia: only letters, digits, '-' and '.'
    * are accepted in template names.
    */
   for (p = templatename; *p != '\0'; p++)
   {
      const int is_lower = (*p >= 'a') && (*p <= 'z');
      const int is_upper = (*p >= 'A') && (*p <= 'Z');
      const int is_digit = (*p >= '0') && (*p <= '9');

      if (!(is_lower || is_upper || is_digit || (*p == '-') || (*p == '.')))
      {
         /* Illegal character */
         return JB_ERR_FILE;
      }
   }

   /*
    * The base directory is templdir if
    * configured, confdir/templates otherwise.
    */
   templates_dir_path = (csp->config->templdir != NULL)
      ? strdup(csp->config->templdir)
      : make_path(csp->config->confdir, "templates");
   if (templates_dir_path == NULL)
   {
      log_error(LOG_LEVEL_ERROR, "Out of memory while generating template path for %s.",
         templatename);
      return JB_ERR_MEMORY;
   }

   full_path = make_path(templates_dir_path, templatename);
   free(templates_dir_path);
   if (full_path == NULL)
   {
      log_error(LOG_LEVEL_ERROR, "Out of memory while generating full template path for %s.",
         templatename);
      return JB_ERR_MEMORY;
   }

   /* Start out with an empty buffer that grows line by line. */
   file_buffer = strdup("");
   if (file_buffer == NULL)
   {
      log_error(LOG_LEVEL_ERROR, "Not enough free memory to buffer %s.", full_path);
      free(full_path);
      return JB_ERR_MEMORY;
   }

   fp = fopen(full_path, "r");
   if (fp == NULL)
   {
      log_error(LOG_LEVEL_ERROR, "Cannot open template file %s: %E", full_path);
      free(full_path);
      free(file_buffer);
      return JB_ERR_FILE;
   }
   free(full_path);

   /*
    * Copy the file line by line. "#include" lines pull in another
    * template (top level only), every other line starting with '#'
    * is a comment and gets dropped.
    *
    * XXX: The comment handling could break with lines lengths > sizeof(buf).
    * This is unlikely in practise.
    */
   while (fgets(buf, sizeof(buf), fp))
   {
      if (!recursive && !strncmp(buf, "#include ", 9))
      {
         err = template_load(csp, &included_module, chomp(buf + 9), 1);
         if (err != JB_ERR_OK)
         {
            free(file_buffer);
            fclose(fp);
            return err;
         }

         /*
          * NOTE(review): neither buffer is freed here on failure;
          * presumably string_join() releases both - verify.
          */
         if (string_join(&file_buffer, included_module))
         {
            fclose(fp);
            return JB_ERR_MEMORY;
         }
      }
      else if (*buf != '#')
      {
         /*
          * NOTE(review): file_buffer is not freed here on failure;
          * presumably string_append() releases it - verify.
          */
         if (string_append(&file_buffer, buf))
         {
            fclose(fp);
            return JB_ERR_MEMORY;
         }
      }
   }
   fclose(fp);

   *template_ptr = file_buffer;

   return JB_ERR_OK;
}
/*********************************************************************
*
* Function : template_fill
*
* Description : CGI support function that fills in a pre-loaded
* HTML template by replacing @name@ with value using
* pcrs, for each item in the output map.
*
* Note that a leading '$' character in the export map's
* names will be stripped and toggle on backreference
* interpretation.
*
* Parameters :
* 1 : template_ptr = IN: Template to be filled out.
* Will be free()d.
* OUT: Filled out template.
* Caller must free().
* 2 : exports = map with fill in symbol -> name pairs
*
* Returns : JB_ERR_OK on success (and for uncritical errors)
* JB_ERR_MEMORY on out-of-memory error
*
*********************************************************************/
jb_err template_fill(char **template_ptr, const struct map *exports)
{
   struct map_entry *m;
   pcrs_job *job;
   char buf[BUFFER_SIZE];
   char *tmp_out_buffer;
   char *file_buffer;
   size_t size;
   int error;
   const char *flags;

   assert(template_ptr);
   assert(*template_ptr);
   assert(exports);

   /*
    * From here on this function owns the template buffer. It is
    * either handed back through *template_ptr or freed on failure.
    */
   file_buffer = *template_ptr;
   size = strlen(file_buffer) + 1;

   /*
    * Assemble pcrs joblist from exports map
    */
   for (m = exports->first; m != NULL; m = m->next)
   {
      if (*m->name == '$')
      {
         /*
          * First character of name is '$', so remove this flag
          * character and allow backreferences ($1 etc) in the
          * "replace with" text.
          */
         snprintf(buf, sizeof(buf), "%s", m->name + 1);
         flags = "sigU";
      }
      else
      {
         /*
          * Treat the "replace with" text as a literal string -
          * no quoting needed, no backreferences allowed.
          * ("Trivial" ['T'] flag).
          */
         flags = "sigTU";

         /* Enclose name in @@ */
         snprintf(buf, sizeof(buf), "@%s@", m->name);
      }

      log_error(LOG_LEVEL_CGI, "Substituting: s/%s/%s/%s", buf, m->value, flags);

      /* Make and run job. */
      job = pcrs_compile(buf, m->value, flags, &error);
      if (job == NULL)
      {
         if (error == PCRS_ERR_NOMEM)
         {
            free(file_buffer);
            *template_ptr = NULL;
            return JB_ERR_MEMORY;
         }
         else
         {
            log_error(LOG_LEVEL_ERROR, "Error compiling template fill job %s: %d", m->name, error);
            /* Hope it wasn't important and silently ignore the invalid job */
         }
      }
      else
      {
         error = pcrs_execute(job, file_buffer, size, &tmp_out_buffer, &size);

         pcrs_free_job(job);
         if (NULL == tmp_out_buffer)
         {
            /*
             * No output buffer was produced. The input buffer is
             * still ours and has to be released like in the
             * PCRS_ERR_NOMEM case above, otherwise it leaks since
             * the caller only gets a NULL pointer back.
             */
            free(file_buffer);
            *template_ptr = NULL;
            return JB_ERR_MEMORY;
         }

         if (error < 0)
         {
            /*
             * Substitution failed, keep the original buffer,
             * log the problem and ignore it.
             *
             * The user might see some unresolved @CGI_VARIABLES@,
             * but returning a special CGI error page seems unreasonable
             * and could mask more important error messages.
             */
            free(tmp_out_buffer);
            log_error(LOG_LEVEL_ERROR, "Failed to execute s/%s/%s/%s. %s",
               buf, m->value, flags, pcrs_strerror(error));
         }
         else
         {
            /* Substitution succeeded, use modified buffer. */
            free(file_buffer);
            file_buffer = tmp_out_buffer;
         }
      }
   }

   /*
    * Return
    */
   *template_ptr = file_buffer;
   return JB_ERR_OK;
}
/*********************************************************************
*
* Function : template_fill_for_cgi
*
* Description : CGI support function that loads a HTML template
* and fills it in. Handles file-not-found errors
* by sending a HTML error message. For convenience,
* this function also frees the passed "exports" map.
*
* Parameters :
* 1 : csp = Client state
* 2 : templatename = name of the HTML template to be used
* 3 : exports = map with fill in symbol -> name pairs.
* Will be freed by this function.
* 4 : rsp = Response structure to fill in.
*
* Returns : JB_ERR_OK on success
* JB_ERR_MEMORY on out-of-memory error
*
*********************************************************************/
jb_err template_fill_for_cgi(const struct client_state *csp,
                             const char *templatename,
                             struct map *exports,
                             struct http_response *rsp)
{
   jb_err err;

   assert(csp);
   assert(templatename);
   assert(exports);
   assert(rsp);

   err = template_load(csp, &rsp->body, templatename, 0);
   if (!err)
   {
      /* Template loaded, substitute the exports. */
      err = template_fill(&rsp->body, exports);
      free_map(exports);
      return err;
   }

   /* Loading failed; the exports map is freed on every path. */
   free_map(exports);

   if (err == JB_ERR_FILE)
   {
      /* A missing or unreadable template gets its own error page. */
      return cgi_error_no_template(csp, rsp, templatename);
   }

   return err; /* JB_ERR_MEMORY */
}
/*********************************************************************
*
* Function : default_exports
*
* Description : returns a struct map list that contains exports
* which are common to all CGI functions.
*
* Parameters :
* 1 : csp = Current client state (buffers, headers, etc...)
* 2 : caller = name of CGI who calls us and which should
* be excluded from the generated menu. May be
* NULL.
* Returns : NULL if no memory, else a new map. Caller frees.
*
*********************************************************************/
struct map *default_exports(const struct client_state *csp, const char *caller)
{
   char buf[30];
   jb_err err;
   struct map *exports;
   char *ip_address = NULL;
   char *port = NULL;
   char *hostname = NULL;
   int hostname_is_configured;
   int manual_is_external;
   int have_admin_address;
   int have_proxy_info_url;

   assert(csp);

   exports = new_map();
   if (exports == NULL)
   {
      return NULL;
   }

   /*
    * A configured hostname overrides the looked-up one, in which
    * case get_host_information() is not asked for the name at all.
    */
   hostname_is_configured = (csp->config->hostname != NULL);
   get_host_information(csp->cfd, &ip_address, &port,
      hostname_is_configured ? NULL : &hostname);
   if (hostname_is_configured)
   {
      hostname = strdup(csp->config->hostname);
   }

   err = map(exports, "version", 1, html_encode(VERSION), 0);

   get_locale_time(buf, sizeof(buf));
   if (!err) err = map(exports, "time", 1, html_encode(buf), 0);

   /* Address information, "unknown" where nothing could be obtained. */
   if (!err) err = map(exports, "my-ip-address", 1, html_encode(ip_address ? ip_address : "unknown"), 0);
   freez(ip_address);
   if (!err) err = map(exports, "my-port", 1, html_encode(port ? port : "unknown"), 0);
   freez(port);
   if (!err) err = map(exports, "my-hostname", 1, html_encode(hostname ? hostname : "unknown"), 0);
   freez(hostname);

   if (!err) err = map(exports, "homepage", 1, html_encode(HOME_PAGE_URL), 0);
   if (!err) err = map(exports, "default-cgi", 1, html_encode(CGI_PREFIX), 0);
   if (!err) err = map(exports, "menu", 1, make_menu(caller, csp->config->feature_flags), 0);
   if (!err) err = map(exports, "code-status", 1, CODE_STATUS, 1);

   /*
    * A manual location starting with "file://" or "http" is linked
    * to directly, anything else is delivered by Privoxy itself.
    */
   manual_is_external = !strncmpic(csp->config->usermanual, "file://", 7)
                     || !strncmpic(csp->config->usermanual, "http", 4);
   if (!err)
   {
      if (manual_is_external)
      {
         err = map(exports, "user-manual", 1, html_encode(csp->config->usermanual), 0);
      }
      else
      {
         err = map(exports, "user-manual", 1, html_encode(CGI_PREFIX"user-manual/"), 0);
      }
   }
   if (!err) err = map(exports, "actions-help-prefix", 1, ACTIONS_HELP_PREFIX ,1);

#ifdef FEATURE_TOGGLE
   if (!err) err = map_conditional(exports, "enabled-display", global_toggle_state);
#else
   if (!err) err = map_block_killer(exports, "can-toggle");
#endif

   /* Stable releases do not show the "unstable" template blocks. */
   if (!strcmp(CODE_STATUS, "stable"))
   {
      if (!err) err = map_block_killer(exports, "unstable");
   }

   /* Local contact information, or removal of the blocks referring to it. */
   have_admin_address = (csp->config->admin_address != NULL);
   if (!err)
   {
      err = have_admin_address
         ? map(exports, "admin-address", 1, html_encode(csp->config->admin_address), 0)
         : map_block_killer(exports, "have-adminaddr-info");
   }

   have_proxy_info_url = (csp->config->proxy_info_url != NULL);
   if (!err)
   {
      err = have_proxy_info_url
         ? map(exports, "proxy-info-url", 1, html_encode(csp->config->proxy_info_url), 0)
         : map_block_killer(exports, "have-proxy-info");
   }

   /* With neither of the two available there is no local help at all. */
   if (!have_admin_address && !have_proxy_info_url)
   {
      if (!err) err = map_block_killer(exports, "have-help-info");
   }

   if (err)
   {
      free_map(exports);
      return NULL;
   }

   return exports;
}
/*********************************************************************
*
* Function : map_block_killer
*
* Description : Convenience function.
* Adds a "killer" for the conditional HTML-template
* block <name>, i.e. a substitution of the regex
* "if-<name>-start.*if-<name>-end" to the given
* export list.
*
* Parameters :
* 1 : exports = map to extend
* 2 : name = name of conditional block
*
* Returns : JB_ERR_OK on success
* JB_ERR_MEMORY on out-of-memory error.
*
*********************************************************************/
jb_err map_block_killer(struct map *exports, const char *name)
{
   /* Large enough, the block names are hardwired in the callers. */
   char pattern[1000];

   assert(exports);
   assert(name);
   assert(strlen(name) < (size_t)490);

   /* Matches everything from the start marker to the end marker ... */
   snprintf(pattern, sizeof(pattern), "if-%s-start.*if-%s-end", name, name);

   /* ... and maps it to the empty string. */
   return map(exports, pattern, 1, "", 1);
}
/*********************************************************************
*
* Function : map_block_keep
*
* Description : Convenience function. Removes the markers used
* by map-block-killer, to save a few bytes.
* i.e. removes "@if-<name>-start@" and "@if-<name>-end@"
*
* Parameters :
* 1 : exports = map to extend
* 2 : name = name of conditional block
*
* Returns : JB_ERR_OK on success
* JB_ERR_MEMORY on out-of-memory error.
*
*********************************************************************/
jb_err map_block_keep(struct map *exports, const char *name)
{
   /* Large enough, the block names are hardwired in the callers. */
   char marker[500];
   jb_err err;

   assert(exports);
   assert(name);
   assert(strlen(name) < (size_t)490);

   /* Map the start marker to the empty string ... */
   snprintf(marker, sizeof(marker), "if-%s-start", name);
   err = map(exports, marker, 1, "", 1);

   /* ... and, if that worked, the end marker as well. */
   if (!err)
   {
      snprintf(marker, sizeof(marker), "if-%s-end", name);
      err = map(exports, marker, 1, "", 1);
   }

   return err;
}
/*********************************************************************
*
* Function : map_conditional
*
* Description : Convenience function.
* Adds an "if-then-else" for the conditional HTML-template
* block <name>, i.e. a substitution of the form:
* @if-<name>-then@
* True text
* @else-not-<name>@
* False text
* @endif-<name>@
*
* The control structure and one of the alternatives
* will be hidden.
*
* Parameters :
* 1 : exports = map to extend
* 2 : name = name of conditional block
* 3 : choose_first = nonzero for first, zero for second.
*
* Returns : JB_ERR_OK on success
* JB_ERR_MEMORY on out-of-memory error.
*
*********************************************************************/
jb_err map_conditional(struct map *exports, const char *name, int choose_first)
{
   /* Large enough, the block names are hardwired in the callers. */
   char pattern[1000];
   jb_err err;

   assert(exports);
   assert(name);
   assert(strlen(name) < (size_t)480);

   /* First job: map the alternative that was not chosen to nothing. */
   if (choose_first)
   {
      snprintf(pattern, sizeof(pattern), "else-not-%s@.*@endif-%s", name, name);
   }
   else
   {
      snprintf(pattern, sizeof(pattern), "if-%s-then@.*@else-not-%s", name, name);
   }
   err = map(exports, pattern, 1, "", 1);
   if (err)
   {
      return err;
   }

   /* Second job: map the remaining control marker to nothing, too. */
   if (choose_first)
   {
      snprintf(pattern, sizeof(pattern), "if-%s-then", name);
   }
   else
   {
      snprintf(pattern, sizeof(pattern), "endif-%s", name);
   }

   return map(exports, pattern, 1, "", 1);
}
/*********************************************************************
*
* Function : make_menu
*
* Description : Returns an HTML-formatted menu of the available
* unhidden CGIs, excluding the one given in <self>
* and the toggle CGI if toggling is disabled.
*
* Parameters :
* 1 : self = name of CGI to leave out, can be NULL for
* complete listing.
* 2 : feature_flags = feature bitmap from csp->config
*
*
* Returns : menu string, or NULL on out-of-memory error.
*
*********************************************************************/
char *make_menu(const char *self, const unsigned feature_flags)
{
/*
 * The menu is accumulated in result, which starts out as an empty
 * strdup()ed string. The loop walks cgi_dispatchers up to the entry
 * whose name is NULL and only considers entries that have a
 * description and whose name differs from self.
 *
 * NOTE(review): from the string_append() call inside the loop onwards
 * this copy of the function looks damaged - the string literal is
 * unterminated, the braces do not balance and the final return refers
 * to an identifier (ret) that is not declared here. Presumably markup
 * was lost; confirm against a pristine source.
 */
const struct cgi_dispatcher *d;
char *result = strdup("");
if (self == NULL)
{
/* Placeholder for the strcmp() against d->name in the loop below. */
self = "NO-SUCH-CGI!";
}
/* List available unhidden CGI's and export as "other-cgis" */
for (d = cgi_dispatchers; d->name; d++)
{
#ifdef FEATURE_TOGGLE
if (!(feature_flags & RUNTIME_FEATURE_CGI_TOGGLE) && !strcmp(d->name, "toggle"))
{
/*
* Suppress the toggle link if remote toggling is disabled.
*/
continue;
}
#endif /* def FEATURE_TOGGLE */
if (d->description && strcmp(d->name, self))
{
char *html_encoded_prefix;
/*
* Line breaks would be great, but break
* the "blocked" template's JavaScript.
*/
string_append(&result, "
\n");
return ret;
}
/*
Local Variables:
tab-width: 3
end:
*/
privoxy-3.0.21-stable/./configure.in 000640 001751 001751 00000077001 12114164002 016267 0 ustar 00fk fk 000000 000000 dnl Process this file with autoconf to produce a configure script.
dnl
dnl $Id: configure.in,v 1.179 2013/03/01 17:40:18 fabiankeil Exp $
dnl
dnl Written by and Copyright (C) 2001-2010 the
dnl Privoxy team. http://www.privoxy.org/
dnl
dnl Based on the Internet Junkbuster originally written
dnl by and Copyright (C) 1997 Anonymous Coders and
dnl Junkbusters Corporation. http://www.junkbusters.com
dnl
dnl This program is free software; you can redistribute it
dnl and/or modify it under the terms of the GNU General
dnl Public License as published by the Free Software
dnl Foundation; either version 2 of the License, or (at
dnl your option) any later version.
dnl
dnl This program is distributed in the hope that it will
dnl be useful, but WITHOUT ANY WARRANTY; without even the
dnl implied warranty of MERCHANTABILITY or FITNESS FOR A
dnl PARTICULAR PURPOSE. See the GNU General Public
dnl License for more details.
dnl
dnl The GNU General Public License should be included with
dnl this file. If not, you can view it at
dnl http://www.gnu.org/copyleft/gpl.html
dnl or write to the Free Software Foundation, Inc., 59
dnl Temple Place - Suite 330, Boston, MA 02111-1307, USA.
dnl
dnl =================================================================
dnl AutoConf Initialization
dnl =================================================================
AC_REVISION($Revision: 1.179 $)
AC_INIT(jcc.c)
if test ! -f config.h.in; then
echo "You need to run autoheader first. "
echo -n "Shall I do this for you now? (y/n) "
read answer
if test "$answer" != "y"; then
exit 1
else
autoheader
fi
fi
AC_CONFIG_HEADER([config.h])
AC_CANONICAL_HOST
dodk=auto
DKPREFIX=none
AC_ARG_WITH(docbook, dnl
--with-docbook=[[yes|no|directory]]
Enable docbook documentation creation
(default = yes, for gnu and linux),[dnl
case "$with_docbook" in
yes) dodk=yes;;
no) dodk=no;;
*)
dodk=yes
DKPREFIX=$withval
;;
esac
])
DB2HTML=false
AC_ARG_WITH(db2html, dnl
--with-db2html=
Set the location of the docbook to html converter
(default = search),[dnl
DB2HTML=$withval
])
dnl =================================================================
dnl Application version number
dnl =================================================================
VERSION_MAJOR=3
VERSION_MINOR=0
VERSION_POINT=21
CODE_STATUS="stable"
dnl CODE_STATUS can be "alpha", "beta", "stable" or "UNRELEASED",
dnl and will be used for CGI output. Increment version number and
dnl set status to "UNRELEASED" whenever CVS differs from the last
dnl release and no new release is near.
dnl =================================================================
dnl Substitute the version numbers
dnl =================================================================
AC_SUBST(VERSION_MAJOR)
AC_SUBST(VERSION_MINOR)
AC_SUBST(VERSION_POINT)
AC_SUBST(CODE_STATUS)
dnl
AC_DEFINE_UNQUOTED(VERSION_MAJOR,${VERSION_MAJOR})
AC_DEFINE_UNQUOTED(VERSION_MINOR,${VERSION_MINOR})
AC_DEFINE_UNQUOTED(VERSION_POINT,${VERSION_POINT})
AC_DEFINE_UNQUOTED(VERSION,"${VERSION_MAJOR}.${VERSION_MINOR}.${VERSION_POINT}")
AC_DEFINE_UNQUOTED(CODE_STATUS,"${CODE_STATUS}")
dnl =================================================================
dnl Checks for programs needed to build.
dnl =================================================================
dnl Keep AC_PROG_CC from setting its own defaults:
if test "X$CFLAGS" = "X"; then
CFLAGS=" "
fi
AC_PROG_CC
AC_PROG_CPP
AC_PROG_INSTALL
AC_PROG_LN_S
AC_PROG_MAKE_SET
AC_PROG_AWK
AC_CHECK_PROG(GDB,gdb,yes,no)
AC_PATH_PROG(BGROUPS,groups,no,$PATH:/bin:/usr/bin:/usr/local/bin)
AC_PATH_PROG(ID,id,no,$PATH:/bin:/usr/bin:/usr/local/bin)
AC_SUBST(ID)
AC_SUBST(BGROUPS)
dnl =================================================================
dnl debug, gcc and gdb support
dnl =================================================================
dnl --with-debug adds debugging information to CFLAGS.
dnl Without the option, gcc builds default to -O2 as long as
dnl CFLAGS still holds the single-space placeholder set earlier.
AC_ARG_WITH(debug,
[ --with-debug Enable debug mode],
[
  if test "x$withval" != "xno" ; then
    if test "$ac_cv_prog_cc_g" = yes; then
      if test "$GCC" = yes; then
        # GDB holds either yes or no, so its value has to be compared.
        # A plain non-empty test would pick -ggdb even without gdb.
        if test "$GDB" = yes; then
          CFLAGS="$CFLAGS -ggdb"
        else
          CFLAGS="$CFLAGS -g"
        fi
        CFLAGS="$CFLAGS -Wshadow -Wconversion"
      else
        CFLAGS="$CFLAGS -g"
      fi
    fi
  fi
],
[
  if test "X$CFLAGS" = "X "; then # if CFLAGS were unset (see above)
    if test "$GCC" = yes; then
      CFLAGS="-O2"
    fi
  fi
]
)
dnl =================================================================
dnl Check for user and group validity
dnl =================================================================
if test "$EMXOS2" = yes || test "$host_os" = haiku; then
echo "Skipping user and group validity stuff.";
else
$ID privoxy >/dev/null 2>/dev/null
if test $? -ne 0 ; then
AC_MSG_WARN(There is no user 'privoxy' on this system)
fi
AC_MSG_CHECKING([for user])
AC_ARG_WITH(user,
[ --with-user=privoxy Set user under which privoxy will run],
[
if test "x$withval" != "xyes"; then
if test $ID = no ; then
AC_MSG_ERROR(There is no 'id' program on this system)
else
AC_MSG_RESULT($with_user)
$ID $with_user 2>/dev/null >/dev/null
if test $? -eq 0 ; then
USER=$with_user;
else
AC_MSG_ERROR(There is no user '$with_user' on this system)
fi
fi
else
AC_MSG_ERROR(We need a user if you give me this parameter)
fi
],
[
if test $ID = no ; then
AC_MSG_ERROR(There is no 'id' programm on this system)
else
AC_MSG_RESULT(none specified)
USER=$with_user
fi
]
)
AC_SUBST(USER)
AC_MSG_CHECKING([for group])
AC_ARG_WITH(group,
[ --with-group=privoxy Set group for privoxy],
[
if test "x$withval" != "xyes"; then
if test $BGROUPS = no ; then
AC_MSG_ERROR(There is no 'groups' program on this system)
else
AC_MSG_RESULT($with_group)
$BGROUPS $USER >/dev/null
if test $? -eq 0 ; then
# FIXME: this fails if valid group, but not first group
# listed.
if test "$with_group" != "`$BGROUPS $USER | sed 's/.*: //' 2>/dev/null |$AWK '{print $1}'`" ; then
AC_MSG_ERROR(The given value '$withval' does not match group entry)
else
GROUP=$with_group;
fi
else
AC_MSG_ERROR(There is no group entry for user '$USER')
fi
fi
else
AC_MSG_ERROR(We need a group if you give me this parameter)
fi
],
[
if test $BGROUPS = no ; then
AC_MSG_ERROR(There is no 'groups' programm on this system)
else
AC_MSG_RESULT(none specified)
GROUP=$with_group;
fi
]
)
AC_SUBST(GROUP)
fi
dnl =================================================================
dnl additional gcc flags
dnl =================================================================
dnl
if test "$GCC"; then
if test "$host" != "powerpc-unknown-amigaos"; then
CFLAGS="-pipe $CFLAGS"
fi
fi
dnl =================================================================
dnl Build type
dnl =================================================================
dnl
dnl Must do this first.
dnl
dnl Reason: This sets CFLAGS in order to switch the Cygwin compiler
dnl into Cygwin or MinGW32 modes. Depending on the mode selected,
dnl the compiler will use completely different sets of library
dnl and include files.
dnl
dnl =================================================================
AC_MINGW32
AC_CYGWIN
if test "$MINGW32" = "yes"; then
target_type=mingw
else
if test "$CYGWIN" = "yes"; then
target_type=cygwin
else
target_type=unix
fi
fi
if test $dodk = auto; then
dodk=no
if test $target_type = unix; then
case "$host_os" in
linux* | gnu*)
dodk=yes
;;
esac
fi
fi
dnl Decide what to do based on target_type
dnl Note: PTHREAD_LIB is always set, even if pthread is disabled.
dnl This is because we don't know yet whether pthread is enabled.
AC_ARG_ENABLE(mingw32,
[ --enable-mingw32 Use mingw32 for a Windows GUI],
[if test $enableval = yes; then
target_type=mingw
fi])
if test $target_type = mingw; then
WIN_ONLY=
CFLAGS="$CFLAGS -DWINVER=0x501"
SPECIAL_CFLAGS="-mwindows -mno-cygwin"
PTHREAD_LIB=-lpthreadGC
echo "Using mingw32 (Win32 GUI)"
else
WIN_ONLY=#
if test $target_type = cygwin; then
SPECIAL_CFLAGS="-mno-win32"
PTHREAD_LIB=
echo "Using Cygnus (Win32 command line)"
else
SPECIAL_CFLAGS=
PTHREAD_LIB=-lpthread
fi
fi
AC_SUBST(WIN_ONLY)
if test $dodk != no; then
AC_CHECK_PROGS(W3M, w3m, false)
if test "$W3M" = false; then
AC_MSG_WARN(You need w3m to build text documentation.)
fi
if test $DB2HTML = false; then
dnl We need to clean the variable, otherwise AC_CHECK_PROGS
dnl will fail
DB2HTML=""
AC_CHECK_PROGS(DB2HTML,db2html docbook2html,false)
fi
fi
AC_SUBST(W3M)
AC_SUBST(DB2HTML)
dnl If we use rpm, we need to check where %_topdir is
AC_CHECK_PROGS(RPMBIN,rpm,false)
if test $RPMBIN != false; then
RPM_BASE=`rpm --eval "%{_topdir}"`
if test "$RPM_BASE" = ""; then
RPM_BASE=/usr/src/redhat
fi
fi
AC_SUBST(RPM_BASE)
dnl Check for jade, so we can build the documentation
AC_CHECK_PROGS(JADEBIN,jade openjade,false)
AC_SUBST(JADEBIN)
dnl Check for man2html for docs.
AC_CHECK_PROGS(MAN2HTML,man2html,false)
AC_SUBST(MAN2HTML)
dnl Set doc status flag for conditional content inclusions
DOC_STATUS=p-not-stable
if test $CODE_STATUS = stable; then
DOC_STATUS="p-stable"
fi
AC_SUBST(DOC_STATUS)
dnl Checking for the docbook.dsl stylesheet file
dnl It is still not portable (directory slash)
JADECAT=""
if test $dodk = yes; then
if test $DKPREFIX = none; then
for i in /usr/share/sgml/docbook/dsssl-stylesheets \
/usr/share/sgml/docbkdsl /usr/share/sgml/docbook-dsssl \
/usr/local/share/sgml/docbook/dsssl/modular \
/usr/share/sgml/docbook/stylesheet/dsssl/modular/ \
; do
dnl echo -n does not fly with /bin/sh.
dnl echo -n "checking for $i/html/docbook.dsl..."
AC_MSG_CHECKING([for $i])
if test -f $i/html/docbook.dsl; then
echo "yes"
DKPREFIX=$i
break
else
echo "no"
fi
done
# where are the catalogs?
for i in /usr/share/sgml/CATALOG.docbk30 \
/usr/share/sgml/CATALOG.docbk31 \
/usr/share/sgml/CATALOG.docbk31 \
/usr/local/share/sgml/docbook/3.0/docbook.cat \
/usr/local/share/sgml/docbook/3.1/docbook.cat \
/usr/share/sgml/docbook/dtd/3.1/docbook.cat \
; do
dnl echo -n "checking for $i..."
AC_MSG_CHECKING([for $i])
if test -f $i; then
echo "yes"
JADECAT="$JADECAT -c $i"
else
echo "no"
fi
done
fi
fi
AC_SUBST(JADECAT)
AC_SUBST(DKPREFIX)
AC_ARG_ENABLE(large-file-support,
[ --enable-large-file-support Define _LARGE_FILES and friends.
Required by some systems to support files larger then 2GB.],
[if test $enableval = yes; then
CFLAGS="$CFLAGS -D_FILE_OFFSET_BITS=64 -D_LARGE_FILES -D_LARGEFILE_SOURCE=1"
fi])
dnl Save old CFLAGS so we can restore them later, then add SPECIAL_CFLAGS
old_CFLAGS_nospecial=$CFLAGS
CFLAGS="$CFLAGS $SPECIAL_CFLAGS"
# Hack to force AutoConf to use the CFLAGS we just set
dnl Warning: This may break with a future version of Autoconf
dnl Tested with autoconf 2.13
ac_cpp='$CPP $CPPFLAGS $SPECIAL_CFLAGS'
ac_compile='${CC-cc} -c $CFLAGS $CPPFLAGS conftest.$ac_ext 1>&5'
ac_link='${CC-cc} -o conftest${ac_exeext} $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS 1>&5'
dnl =================================================================
dnl Thread support
dnl =================================================================
AC_CHECK_HEADER(pthread.h, [have_pthread=yes], [have_pthread=no])
AC_ARG_ENABLE(pthread,
[ --disable-pthread Don't use POSIX threads (pthreads)],
[if test $enableval = no; then
# Disable pthreads
if test $have_pthread = yes; then
AC_MSG_WARN([pthreads seem to be available but you are using --disable-pthread.])
AC_MSG_WARN([This is almost always a mistake and can render Privoxy unacceptable slow.])
fi
have_pthread=no
fi])
if test $have_pthread = yes; then
PTHREAD_ONLY=
AC_DEFINE(FEATURE_PTHREAD)
echo Using POSIX threads
if test "$GCC" = "yes"; then
# Set a GCC specific switch:
if test "$target_type" = "unix"; then
ac_jgf_save_CFLAGS=$CFLAGS
CFLAGS="$CFLAGS -pthread"
AC_TRY_LINK([#include ],
[void *p = pthread_create;],
[
# This compiler switch makes GCC on Linux thread-safe
# However, it's not supported on most other OS.
PTHREAD_LIB=
SPECIAL_CFLAGS="-pthread"
])
CFLAGS=$ac_jgf_save_CFLAGS
fi
fi
else
PTHREAD_ONLY=#
echo Using native threads
fi
AC_SUBST(PTHREAD_ONLY)
dnl =================================================================
dnl Support for thread-safe versions of gethostbyaddr, gethostbyname,
dnl gmtime and localtime
dnl =================================================================
dnl Next line needed to find the gethost*_r functions on Solaris
AC_CHECK_LIB(nsl, gethostbyname)
dnl Work out which gethostbyaddr_r() prototype is available by
dnl test-compiling a call with 8, then 7, then 5 arguments. The first
dnl variant that compiles decides which HAVE_GETHOSTBYADDR_R_* macro
dnl gets defined.
AC_CHECK_FUNC(gethostbyaddr_r, [
  AC_MSG_CHECKING([signature of gethostbyaddr_r])
  AC_TRY_COMPILE([
#   include <netdb.h>
  ], [
    struct hostent *h, *hp;
    char *a, *b;
    int l, bl, t, e;
    (void) gethostbyaddr_r(a, l, t, h, b, bl, &hp, &e)
  ], [
    AC_DEFINE(HAVE_GETHOSTBYADDR_R_8_ARGS)
    AC_MSG_RESULT([8 args])
  ], [
    AC_TRY_COMPILE([
#     include <netdb.h>
    ], [
      struct hostent *h;
      char *a, *b;
      int l, bl, t, e;
      (void) gethostbyaddr_r(a, l, t, h, b, bl, &e)
    ], [
      AC_DEFINE(HAVE_GETHOSTBYADDR_R_7_ARGS)
      AC_MSG_RESULT([7 args])
    ], [
      AC_TRY_COMPILE([
#       include <netdb.h>
      ], [
        struct hostent_data *d;
        struct hostent *h;
        char *a;
        int l, t;
        (void) gethostbyaddr_r(a, l, t, h, d)
      ], [
        AC_DEFINE(HAVE_GETHOSTBYADDR_R_5_ARGS)
        AC_MSG_RESULT([5 args])
      ], [
        AC_MSG_RESULT(unrecognised)
      ])
    ])
  ])
], [
  AC_MSG_RESULT(no)
])
dnl Work out which gethostbyname_r() prototype is available by
dnl test-compiling a call with 6, then 5, then 3 arguments. The first
dnl variant that compiles decides which HAVE_GETHOSTBYNAME_R_* macro
dnl gets defined.
AC_CHECK_FUNC(gethostbyname_r, [
  AC_MSG_CHECKING([signature of gethostbyname_r])
  AC_TRY_COMPILE([
#   include <netdb.h>
  ], [
    struct hostent *h, *r;
    char *n, *b;
    int bl, e;
    (void) gethostbyname_r(n, h, b, bl, &r, &e)
  ], [
    AC_DEFINE(HAVE_GETHOSTBYNAME_R_6_ARGS)
    AC_MSG_RESULT([6 args])
  ], [
    AC_TRY_COMPILE([
#     include <netdb.h>
    ], [
      struct hostent *h;
      char *n, *b;
      int bl, e;
      (void) gethostbyname_r(n, h, b, bl, &e)
    ], [
      AC_DEFINE(HAVE_GETHOSTBYNAME_R_5_ARGS)
      AC_MSG_RESULT([5 args])
    ], [
      AC_TRY_COMPILE([
#       include <netdb.h>
      ], [
        struct hostent_data *d;
        struct hostent *h;
        char *n;
        (void) gethostbyname_r(n, h, d)
      ], [
        AC_DEFINE(HAVE_GETHOSTBYNAME_R_3_ARGS)
        AC_MSG_RESULT([3 args])
      ], [
        AC_MSG_RESULT(unrecognised)
      ])
    ])
  ])
], [
  AC_MSG_RESULT(no)
])
dnl Define HAVE_GMTIME_R when gmtime_r() exists and a call taking a
dnl time_t pointer and a struct tm pointer compiles.
AC_CHECK_FUNC(gmtime_r, [
  AC_MSG_CHECKING([signature of gmtime_r])
  AC_TRY_COMPILE([
#   include <time.h>
  ], [
    time_t *t;
    struct tm *tm;
    (void) gmtime_r(t, tm)
  ], [
    AC_MSG_RESULT(ok)
    AC_DEFINE(HAVE_GMTIME_R)
  ], [
    AC_MSG_RESULT(unrecognised)
  ])
], [
  AC_MSG_RESULT(no)
])
dnl Define HAVE_LOCALTIME_R when localtime_r() exists and a call taking
dnl a time_t pointer and a struct tm pointer compiles.
AC_CHECK_FUNC(localtime_r, [
  AC_MSG_CHECKING([signature of localtime_r])
  AC_TRY_COMPILE([
#   include <time.h>
  ], [
    time_t *t;
    struct tm *tm;
    (void) localtime_r(t, tm)
  ], [
    AC_MSG_RESULT(ok)
    AC_DEFINE(HAVE_LOCALTIME_R)
  ], [
    AC_MSG_RESULT(unrecognised)
  ])
], [
  AC_MSG_RESULT(no)
])
dnl =================================================================
dnl Solaris specific
dnl FIXME: Not tested on Solaris yet...
dnl ISFIXED: Have tested it on Solaris, but there are other ways to
dnl make these checks generic, e.g.:
dnl AC_CHECK_FUNC(getsockopt, , AC_CHECK_LIB(socket, getsockopt))
dnl (Moritz Barsnick )
dnl =================================================================
SOCKET_LIB=
case "$host" in
*-solaris*) SOCKET_LIB="-lsocket -lnsl"
AC_DEFINE(__EXTENSIONS__)
if test "$GCC" = "yes"; then
# Set a GCC specific switch:
# This compiler switch makes Solaris thread-safe
PTHREAD_LIB=
SPECIAL_CFLAGS="-pthreads"
else
# What do we do without GCC? Guess this:
SPECIAL_CFLAGS="-D_REENTRANT"
fi
;;
esac
AC_SUBST(SOCKET_LIB)
dnl =================================================================
dnl Solaris problem, and others perhaps (socklen_t is undefined)
dnl =================================================================
AC_MSG_CHECKING([for socklen_t])
AC_EGREP_HEADER(socklen_t, sys/socket.h, AC_MSG_RESULT([yes]),
AC_MSG_RESULT([no])
AC_DEFINE(socklen_t,int,
[ Define to 'int' if doesn't have it. ]))
dnl =================================================================
dnl OS/2 specific
dnl =================================================================
case "$host" in
*-os2-emx*) SOCKET_LIB=-lsocket
;;
esac
AC_SUBST(SOCKET_LIB)
dnl =================================================================
dnl Mac OSX specific
dnl =================================================================
case "$host" in
*-apple-darwin*) SPECIAL_CFLAGS="-Dunix"
;;
esac
dnl =================================================================
dnl OpenBSD specific
dnl =================================================================
case "$host" in
*-openbsd*) SPECIAL_CFLAGS="$SPECIAL_CFLAGS -Dunix"
;;
esac
dnl =================================================================
dnl AmigaOS specific
dnl =================================================================
AMIGAOS_ONLY=#
case "$host" in
*-amigaos) AMIGAOS_ONLY=
;;
esac
AC_SUBST(AMIGAOS_ONLY)
dnl =================================================================
dnl Haiku specific
dnl =================================================================
if test "$host_os" = haiku; then
# Omit the "-pthread" flag to gcc, even when building with gcc 2.95
SPECIAL_CFLAGS=
# Haiku's pthreads implementation exists in its system library,
# libroot, not in a separate pthreads library
PTHREAD_LIB=
# Networking code exists in libnetwork
SOCKET_LIB=-lnetwork
# Search Haiku's common-library folder to find its pcre and
# pcreposix libraries
LIBS="-L/boot/common/lib $LIBS"
fi
dnl =================================================================
dnl Check for standard compiler stuff
dnl =================================================================
AC_EXEEXT
AC_OBJEXT
AC_HEADER_STDC
AC_HEADER_DIRENT
AC_C_CONST
AC_TYPE_SIZE_T
AC_TYPE_PID_T
AC_HEADER_TIME
AC_STRUCT_TM
AC_CHECK_SIZEOF(int, 4)
AC_CHECK_SIZEOF(char *, 4)
AC_CHECK_SIZEOF(long, 4)
AC_CHECK_SIZEOF(long long, 8)
AC_CHECK_SIZEOF(size_t, 4)
dnl Checks for header files.
AC_CHECK_HEADERS([ \
OS.h \
arpa/inet.h \
errno.h \
fcntl.h \
limits.h \
locale.h \
netdb.h \
netinet/in.h \
stddef.h \
stdlib.h \
string.h \
sys/ioctl.h \
sys/socket.h \
sys/time.h \
sys/timeb.h \
sys/wait.h \
unistd.h \
])
dnl Checks for library functions.
dnl bcopy is for PCRE
AC_CHECK_FUNCS([bcopy])
AC_PROG_GCC_TRADITIONAL
AC_TYPE_SIGNAL
AC_CHECK_FUNCS([ \
access \
atexit \
getcwd \
gethostbyaddr \
gethostbyaddr_r \
gethostbyname \
gethostbyname_r \
gettimeofday \
inet_ntoa \
localtime_r \
memchr \
memmove \
memset \
poll \
putenv \
random \
regcomp \
select \
setlocale \
shutdown \
snprintf \
socket \
strchr \
strdup \
strerror \
strftime \
strlcat \
strlcpy \
strptime \
strtoul \
timegm \
tzset \
])
dnl Checks for RFC 2553 resolver and socket functions
AC_ARG_ENABLE(ipv6-support,
[ --disable-ipv6-support Disable IPv6 support and other RFC-2554-related improvements],
[if test $enableval = yes; then
enable_ipv6_support=yes
fi], enable_ipv6_support=yes)
if test $enable_ipv6_support != yes; then
AC_MSG_WARN([Skipping checks for IPv6 support and other RFC-2554-related improvements.
Due to lock contention, this may result in slower DNS resolution for IPv4 setups, too.])
elif test $target_type = mingw; then
AC_CHECK_LIB(ws2_32, main)
AC_MSG_CHECKING(getaddrinfo in ws2_32)
AC_TRY_LINK(
[
#include
#include
],
[getaddrinfo(0,0,0,0)],
have_ws2_32_getaddrinfo=yes
)
AC_MSG_RESULT($have_ws2_32_getaddrinfo)
AC_MSG_CHECKING(getnameinfo in ws2_32)
AC_TRY_LINK(
[
#include
#include
],
[getnameinfo(0,0,0,0,0,0,0)],
have_ws2_32_getnameinfo=yes
)
AC_MSG_RESULT($have_ws2_32_getnameinfo)
if test $have_ws2_32_getaddrinfo ; then
if test $have_ws2_32_getnameinfo ; then
AC_DEFINE([HAVE_RFC2553], [1],
[Define if RFC 2553 resolver functions like getaddrinfo(3) and
getnameinfo(3) present])
fi
fi
else
AC_CHECK_FUNC([getaddrinfo],
[AC_CHECK_FUNC([getnameinfo],
[AC_DEFINE([HAVE_RFC2553], [1],
[Define if RFC 2553 resolver functions like getaddrinfo(3) and
getnameinfo(3) present])
])
])
fi
dnl =================================================================
dnl Checks for libraries.
dnl =================================================================
dnl Note: Some systems may have the library but not the system header
dnl file, so we must check for both.
dnl Also check for correct version
AC_CHECK_LIB(pcre, pcre_compile, [
AC_CHECK_HEADER(pcre.h, [
AC_EGREP_HEADER(pcre_fullinfo, pcre.h, [have_pcre=yes], [AC_MSG_WARN([[pcre old version installed]]); have_pcre=no])
], [
AC_CHECK_HEADER(pcre/pcre.h, [
AC_EGREP_HEADER(pcre_fullinfo, pcre/pcre.h, [have_pcre=yes]; [AC_DEFINE(PCRE_H_IN_SUBDIR)], [AC_MSG_WARN([[pcre old version installed]]); have_pcre=no])
], [have_pcre=no])
])
], [have_pcre=no])
AC_CHECK_LIB(pcreposix, regcomp, [
AC_CHECK_HEADER(pcreposix.h, [
AC_EGREP_HEADER(pcreposix_regerror, pcreposix.h, [AC_MSG_WARN([[pcreposix old version installed]]); have_pcreposix=no], [have_pcreposix=yes])
], [
AC_CHECK_HEADER(pcre/pcreposix.h, [
AC_EGREP_HEADER(pcreposix_regerror, pcre/pcreposix.h, [AC_MSG_WARN([[pcreposix old version installed]]); have_pcreposix=no], [have_pcreposix=yes]; [AC_DEFINE(PCREPOSIX_H_IN_SUBDIR)])
], [have_pcreposix=no])
])
], [have_pcreposix=no], -lpcre)
dnl ================================================================
dnl libpcrs is temporarily disabled.
dnl
dnl Privoxy's own pcrs version fixes some problems that
dnl are present in libpcrs 0.3, the last pcrs release we
dnl know of, and as libpcrs seems to be currently unmaintained
dnl we can't send these fixes upstream.
dnl ================================================================
dnl
dnl AC_CHECK_LIB(pcrs, pcrs_compile, [AC_CHECK_HEADER(pcrs.h, [have_pcrs=yes], [have_pcrs=no])], [have_pcrs=no], -lpcre)
dnl =================================================================
dnl Always defined
dnl =================================================================
AC_DEFINE(__MT__)
dnl =================================================================
dnl Features
dnl =================================================================
AC_ARG_ENABLE(toggle,
[ --disable-toggle Don't support temporary disable],
[if test $enableval = yes; then
AC_DEFINE(FEATURE_TOGGLE)
fi],AC_DEFINE(FEATURE_TOGGLE))
AC_ARG_ENABLE(force,
[ --disable-force Don't allow single-page disable],
[if test $enableval = yes; then
AC_DEFINE(FEATURE_FORCE_LOAD)
fi],AC_DEFINE(FEATURE_FORCE_LOAD))
AC_ARG_ENABLE(fast-redirects,
[ --disable-fast-redirects Don't support fast redirects],
[if test $enableval = yes; then
AC_DEFINE(FEATURE_FAST_REDIRECTS)
fi], AC_DEFINE(FEATURE_FAST_REDIRECTS))
AC_ARG_ENABLE(stats,
[ --disable-stats Don't keep statistics],
[if test $enableval = yes; then
AC_DEFINE(FEATURE_STATISTICS)
fi],AC_DEFINE(FEATURE_STATISTICS))
AC_ARG_ENABLE(ie-images,
[ --enable-ie-images Enable a quick but not always reliable auto-detect whether requests from
MS Internet Explorer are for an image or not.],
[if test $enableval = yes; then
AC_DEFINE(FEATURE_IMAGE_DETECT_MSIE)
fi],)
AC_ARG_ENABLE(image-blocking,
[ --disable-image-blocking Don't try to figure out whether a request is
for an image or HTML - assume HTML.],
[if test $enableval = yes; then
AC_DEFINE(FEATURE_IMAGE_BLOCKING)
fi],
AC_DEFINE(FEATURE_IMAGE_BLOCKING))
AC_ARG_ENABLE(acl-support,
[ --disable-acl-support Prevents the use of ACLs to control access to
Privoxy by IP address.],
[if test $enableval = yes; then
AC_DEFINE(FEATURE_ACL)
fi],
AC_DEFINE(FEATURE_ACL))
AC_ARG_ENABLE(trust-files,
[ --disable-trust-files Prevents the use of trust files.],
[if test $enableval = yes; then
AC_DEFINE(FEATURE_TRUST)
fi],
AC_DEFINE(FEATURE_TRUST))
AC_ARG_ENABLE(editor,
[ --disable-editor Prevents the use of the web-based actions file
editor and web-based temporary disable setting.],
[if test $enableval = yes; then
AC_DEFINE(FEATURE_CGI_EDIT_ACTIONS)
fi],
AC_DEFINE(FEATURE_CGI_EDIT_ACTIONS))
AC_ARG_ENABLE(no-gifs,
[ --enable-no-gifs Use politically correct PNG format instead of GIF
for built-in images. May not work with all browsers.],
[if test $enableval = yes; then
AC_DEFINE(FEATURE_NO_GIFS)
fi])
AC_ARG_ENABLE(graceful-termination,
[ --enable-graceful-termination Allow to shutdown Privoxy through the webinterface.],
[if test $enableval = yes; then
AC_DEFINE(FEATURE_GRACEFUL_TERMINATION)
fi])
AC_ARG_ENABLE(extended-host-patterns,
[ --enable-extended-host-patterns Enable and require PCRE syntax in host patterns. This feature hasn't
been announced yet and it's not clear if it's a good idea. It's expected
to work, but undocumented. You should only enable it if you know what
PCRE is and are sure that you need it for your host patterns. You can
use tools/url-pattern-translator.pl to convert existing action files to
use PCRE host patterns. Please don't enable this option when creating
packages for others that may not be expecting it.],
[if test $enableval = yes; then
AC_DEFINE(FEATURE_EXTENDED_HOST_PATTERNS)
fi])
AC_ARG_ENABLE(accept-filter,
[ --enable-accept-filter Try to use accf_http(9) if supported.],
[if test $enableval = yes; then
AC_DEFINE(FEATURE_ACCEPT_FILTER)
fi])
AC_ARG_ENABLE(strptime-sanity-checks,
[ --enable-strptime-sanity-checks Only trust strptime() results if an additional strftime()/strptime()
conversion doesn't change the result. Can be useful if strptime() is
known or suspected to be broken.],
[if test $enableval = yes; then
AC_DEFINE(FEATURE_STRPTIME_SANITY_CHECKS)
fi])
dnl pcre/pcrs is needed for CGI anyway, so
dnl the choice is only between static and
dnl dynamic:
AC_ARG_ENABLE(dynamic-pcre,
[ --disable-dynamic-pcre Use the built-in, static pcre, even if libpcre is available],
[ if test $enableval = "no"; then have_pcre=no; fi ])
dnl =================================================
dnl libpcrs is temporarily disabled,
dnl see comment above for the reason.
dnl =================================================
dnl AC_ARG_ENABLE(dynamic-pcrs,
dnl [ --disable-dynamic-pcrs Use the built-in, static pcrs, even if libpcrs is available],
dnl [ if test $enableval = "no"; then have_pcrs=no; fi ])
dnl ====================================================
dnl This check is incomplete. For mingw32 zlib is found
dnl by configure, but not necessarily by the compiler.
dnl ====================================================
AC_ARG_ENABLE(zlib,
[ --disable-zlib Don't use zlib to decompress data before filtering.],
[enableval2=$enableval],
[enableval2=yes])
if test $enableval2 = yes; then
AC_CHECK_LIB(z, zlibVersion, [have_zlib="yes"], [have_zlib="no"])
if test $have_zlib = "yes"; then
LIBS="$LIBS -lz"
AC_DEFINE(FEATURE_ZLIB,1,[Define to 1 to use zlib to decompress data before filtering.])
else
AC_MSG_WARN([No zlib found.
Privoxy will not be able to filter compressed content.
This may become a fatal error in the future.])
fi
fi
dnl Optional compression of buffered content, off by default.
dnl FEATURE_COMPRESSION is only defined when the zlib check above
dnl has set have_zlib to yes.
AC_ARG_ENABLE(compression,
[ --enable-compression Allow Privoxy to compress buffered content if the client supports it. Requires zlib support.],
[enableval2=$enableval],
[enableval2=no])
if test $enableval2 = yes; then
  # have_zlib is only assigned when the zlib check actually ran.
  # Quoting keeps test well-formed when the variable is empty.
  if test "$have_zlib" = "yes"; then
    echo Enabling compression support.
    AC_DEFINE(FEATURE_COMPRESSION,1,[Define to 1 to use compression through the zlib library.])
  else
    AC_MSG_WARN([No zlib found. Privoxy will not be able to (re-)compress buffered content.])
  fi
fi
# If we have libpcre and either we also have pcreposix or
# we don't need pcreposix, then link pcre dynamically; else
# build it and link statically
#
if test $have_pcre = "yes"; then
echo "using libpcre"
pcre_dyn=yes
STATIC_PCRE_ONLY=#
LIBS="$LIBS -lpcre -lpcreposix"
else
AC_MSG_WARN([You are using the static PCRE code which is out of date and scheduled for removal, for details see:
http://sourceforge.net/mailarchive/forum.php?thread_name=20080511195555.2dc6cfdc%40fabiankeil.de&forum_name=ijbswa-developers])
pcre_dyn=no
AC_DEFINE(STATIC_PCRE)
STATIC_PCRE_ONLY=
fi
AC_DEFINE(FEATURE_CONNECTION_KEEP_ALIVE)
if test $have_pthread = "yes" -o $target_type = "mingw"; then
echo Enabling connection-sharing support.
AC_DEFINE(FEATURE_CONNECTION_SHARING)
fi
dnl =================================================
dnl libpcrs is temporarily disabled,
dnl see comment above for the reason.
dnl =================================================
dnl # If we have libpcrs and pcre is linked dynamically
dnl # then also link pcrs dynamically, else build and link
dnl # pcrs statically
dnl
dnl if test $have_pcrs = "yes" -a $pcre_dyn = "yes"; then
dnl echo "using libpcrs"
dnl STATIC_PCRS_ONLY=#
dnl LIBS="$LIBS -lpcrs"
dnl else
dnl echo "using built-in static pcrs"
AC_DEFINE(STATIC_PCRS)
STATIC_PCRS_ONLY=
dnl fi
AC_SUBST(STATIC_PCRE_ONLY)
AC_SUBST(STATIC_PCRS_ONLY)
dnl =================================================================
dnl Final cleanup and output
dnl =================================================================
dnl Remove the SPECIAL_CFLAGS stuff from CFLAGS, and add it separately
dnl in the Makefile
CFLAGS=$old_CFLAGS_nospecial
AC_SUBST(SPECIAL_CFLAGS)
AC_SUBST(PTHREAD_LIB)
AC_OUTPUT(GNUmakefile doc/source/ldp.dsl)
privoxy-3.0.21-stable/./config 000640 001751 001751 00000170545 12116120047 015160 0 ustar 00fk fk 000000 000000 # Sample Configuration File for Privoxy 3.0.21
#
# $Id: config,v 1.104 2013/03/07 14:11:51 fabiankeil Exp $
#
# Copyright (C) 2001-2013 Privoxy Developers http://www.privoxy.org/
#
####################################################################
# #
# Table of Contents #
# #
# I. INTRODUCTION #
# II. FORMAT OF THE CONFIGURATION FILE #
# #
# 1. LOCAL SET-UP DOCUMENTATION #
# 2. CONFIGURATION AND LOG FILE LOCATIONS #
# 3. DEBUGGING #
# 4. ACCESS CONTROL AND SECURITY #
# 5. FORWARDING #
# 6. MISCELLANEOUS #
# 7. WINDOWS GUI OPTIONS #
# #
####################################################################
#
#
# I. INTRODUCTION
# ===============
#
# This file holds Privoxy's main configuration. Privoxy detects
# configuration changes automatically, so you don't have to restart
# it unless you want to load a different configuration file.
#
# The configuration will be reloaded with the first request after
# the change was made; this request itself will still use the old
# configuration, though. In other words: it takes two requests
# before you see the result of your changes. Requests that are
# dropped due to ACL don't trigger reloads.
#
# When starting Privoxy on Unix systems, give the location of this
# file as last argument. On Windows systems, Privoxy will look for
# this file with the name 'config.txt' in the current working
# directory of the Privoxy process.
#
#
# II. FORMAT OF THE CONFIGURATION FILE
# ====================================
#
# Configuration lines consist of an initial keyword followed by a
# list of values, all separated by whitespace (any number of spaces
# or tabs). For example,
#
# actionsfile default.action
#
# indicates that the actionsfile is named 'default.action'.
#
# The '#' indicates a comment. Any part of a line following a '#' is
# ignored, except if the '#' is preceded by a '\'.
#
# Thus, by placing a # at the start of an existing configuration
# line, you can make it a comment and it will be treated as if it
# weren't there. This is called "commenting out" an option and can
# be useful. Removing the # again is called "uncommenting".
#
# Note that commenting out an option and leaving it at its default
# are two completely different things! Most options behave very
# differently when unset. See the "Effect if unset" explanation in
# each option's description for details.
#
# Long lines can be continued on the next line by using a `\' as the
# last character.
#
#
# 1. LOCAL SET-UP DOCUMENTATION
# ==============================
#
# If you intend to operate Privoxy for more users than just
# yourself, it might be a good idea to let them know how to reach
# you, what you block and why you do that, your policies, etc.
#
#
# 1.1. user-manual
# =================
#
# Specifies:
#
# Location of the Privoxy User Manual.
#
# Type of value:
#
# A fully qualified URI
#
# Default value:
#
# Unset
#
# Effect if unset:
#
# http://www.privoxy.org/version/user-manual/ will be used,
# where version is the Privoxy version.
#
# Notes:
#
# The User Manual is the single best source of information on
# Privoxy, and its URI is used for help links from some of the
# internal CGI pages. The manual itself is normally packaged
# with the binary distributions, so you probably want to set
# this to a locally installed copy.
#
# Examples:
#
# The best all purpose solution is simply to put the full local
# PATH to where the User Manual is located:
#
# user-manual /usr/share/doc/privoxy/user-manual
#
# The User Manual is then available to anyone with access to
# Privoxy, by following the built-in URL:
# http://config.privoxy.org/user-manual/ (or the shortcut:
# http://p.p/user-manual/).
#
# If the documentation is not on the local system, it can be
# accessed from a remote server, as:
#
# user-manual http://example.com/privoxy/user-manual/
#
# WARNING!!!
#
# If set, this option should be the first option in the
# config file, because it is used while the config file is
# being read.
#
#user-manual http://www.privoxy.org/user-manual/
#
# 1.2. trust-info-url
# ====================
#
# Specifies:
#
# A URL to be displayed in the error page that users will see if
# access to an untrusted page is denied.
#
# Type of value:
#
# URL
#
# Default value:
#
# Unset
#
# Effect if unset:
#
# No links are displayed on the "untrusted" error page.
#
# Notes:
#
# The value of this option only matters if the experimental
# trust mechanism has been activated. (See trustfile below.)
#
# If you use the trust mechanism, it is a good idea to write up
# some on-line documentation about your trust policy and to
# specify the URL(s) here. Use multiple times for multiple URLs.
#
# The URL(s) should be added to the trustfile as well, so users
# don't end up locked out from the information on why they were
# locked out in the first place!
#
#trust-info-url http://www.example.com/why_we_block.html
#trust-info-url http://www.example.com/what_we_allow.html
#
# 1.3. admin-address
# ===================
#
# Specifies:
#
# An email address to reach the Privoxy administrator.
#
# Type of value:
#
# Email address
#
# Default value:
#
# Unset
#
# Effect if unset:
#
# No email address is displayed on error pages and the CGI user
# interface.
#
# Notes:
#
# If both admin-address and proxy-info-url are unset, the whole
# "Local Privoxy Support" box on all generated pages will not be
# shown.
#
#admin-address privoxy-admin@example.com
#
# 1.4. proxy-info-url
# ====================
#
# Specifies:
#
# A URL to documentation about the local Privoxy setup,
# configuration or policies.
#
# Type of value:
#
# URL
#
# Default value:
#
# Unset
#
# Effect if unset:
#
# No link to local documentation is displayed on error pages and
# the CGI user interface.
#
# Notes:
#
# If both admin-address and proxy-info-url are unset, the whole
# "Local Privoxy Support" box on all generated pages will not be
# shown.
#
# This URL shouldn't be blocked ;-)
#
#proxy-info-url http://www.example.com/proxy-service.html
#
# 2. CONFIGURATION AND LOG FILE LOCATIONS
# ========================================
#
# Privoxy can (and normally does) use a number of other files for
# additional configuration, help and logging. This section of the
# configuration file tells Privoxy where to find those other files.
#
# The user running Privoxy must have read permission for all
# configuration files, and write permission to any files that would
# be modified, such as log files and actions files.
#
#
# 2.1. confdir
# =============
#
# Specifies:
#
# The directory where the other configuration files are located.
#
# Type of value:
#
# Path name
#
# Default value:
#
# /etc/privoxy (Unix) or Privoxy installation dir (Windows)
#
# Effect if unset:
#
# Mandatory
#
# Notes:
#
# No trailing "/", please.
#
confdir .
#
# 2.2. templdir
# ==============
#
# Specifies:
#
# An alternative directory where the templates are loaded from.
#
# Type of value:
#
# Path name
#
# Default value:
#
# unset
#
# Effect if unset:
#
# The templates are assumed to be located in confdir/template.
#
# Notes:
#
# Privoxy's original templates are usually overwritten with each
# update. Use this option to relocate customized templates that
# should be kept. As template variables might change between
# updates, you shouldn't expect templates to work with Privoxy
# releases other than the one they were part of, though.
#
#templdir .
#
# 2.3. logdir
# ============
#
# Specifies:
#
# The directory where all logging takes place (i.e. where the
# logfile is located).
#
# Type of value:
#
# Path name
#
# Default value:
#
# /var/log/privoxy (Unix) or Privoxy installation dir (Windows)
#
# Effect if unset:
#
# Mandatory
#
# Notes:
#
# No trailing "/", please.
#
logdir .
#
# 2.4. actionsfile
# =================
#
# Specifies:
#
# The actions file(s) to use
#
# Type of value:
#
# Complete file name, relative to confdir
#
# Default values:
#
# match-all.action # Actions that are applied to all sites and maybe overruled later on.
#
# default.action # Main actions file
#
# user.action # User customizations
#
# Effect if unset:
#
# No actions are taken at all. More or less neutral proxying.
#
# Notes:
#
# Multiple actionsfile lines are permitted, and are in fact
# recommended!
#
# The default values are default.action, which is the "main"
# actions file maintained by the developers, and user.action,
# where you can make your personal additions.
#
# Actions files contain all the per site and per URL
# configuration for ad blocking, cookie management, privacy
# considerations, etc. There is no point in using Privoxy
# without at least one actions file.
#
# Note that since Privoxy 3.0.7, the complete filename,
# including the ".action" extension has to be specified. The
# syntax change was necessary to be consistent with the other
# file options and to allow previously forbidden characters.
#
actionsfile match-all.action # Actions that are applied to all sites and maybe overruled later on.
actionsfile default.action # Main actions file
actionsfile user.action # User customizations
#
# 2.5. filterfile
# ================
#
# Specifies:
#
# The filter file(s) to use
#
# Type of value:
#
# File name, relative to confdir
#
# Default value:
#
# default.filter (Unix) or default.filter.txt (Windows)
#
# Effect if unset:
#
# No textual content filtering takes place, i.e. all +filter{name}
# actions in the actions files are turned neutral.
#
# Notes:
#
# Multiple filterfile lines are permitted.
#
# The filter files contain content modification rules that use
# regular expressions. These rules permit powerful changes on
# the content of Web pages, and optionally the headers as well,
# e.g., you could try to disable your favorite JavaScript
# annoyances, re-write the actual displayed text, or just have
# some fun playing buzzword bingo with web pages.
#
# The +filter{name} actions rely on the relevant filter (name)
# to be defined in a filter file!
#
# A pre-defined filter file called default.filter that contains
# a number of useful filters for common problems is included in
# the distribution. See the section on the filter action for a
# list.
#
# It is recommended to place any locally adapted filters into a
# separate file, such as user.filter.
#
filterfile default.filter
filterfile user.filter # User customizations
#
# 2.6. logfile
# =============
#
# Specifies:
#
# The log file to use
#
# Type of value:
#
# File name, relative to logdir
#
# Default value:
#
# Unset (commented out). When activated: logfile (Unix) or
# privoxy.log (Windows).
#
# Effect if unset:
#
# No logfile is written.
#
# Notes:
#
# The logfile is where all logging and error messages are
# written. The level of detail and number of messages are set
# with the debug option (see below). The logfile can be useful
# for tracking down a problem with Privoxy (e.g., it's not
# blocking an ad you think it should block) and it can help you
# to monitor what your browser is doing.
#
# Depending on the debug options below, the logfile may be a
# privacy risk if third parties can get access to it. As most
# users will never look at it, Privoxy 3.0.7 and later only log
# fatal errors by default.
#
# For most troubleshooting purposes, you will have to change
# that, please refer to the debugging section for details.
#
# Your logfile will grow indefinitely, and you will probably
# want to periodically remove it. On Unix systems, you can do
# this with a cron job (see "man cron").
#
# Any log files must be writable by whatever user Privoxy is
# being run as (on Unix, default user id is "privoxy").
#
logfile logfile
#
# 2.7. trustfile
# ===============
#
# Specifies:
#
# The name of the trust file to use
#
# Type of value:
#
# File name, relative to confdir
#
# Default value:
#
# Unset (commented out). When activated: trust (Unix) or
# trust.txt (Windows)
#
# Effect if unset:
#
# The entire trust mechanism is disabled.
#
# Notes:
#
# The trust mechanism is an experimental feature for building
# white-lists and should be used with care. It is NOT
# recommended for the casual user.
#
# If you specify a trust file, Privoxy will only allow access to
# sites that are specified in the trustfile. Sites can be listed
# in one of two ways:
#
# Prepending a ~ character limits access to this site only (and
# any sub-paths within this site), e.g. ~www.example.com allows
# access to ~www.example.com/features/news.html, etc.
#
# Or, you can designate sites as trusted referrers, by
# prepending the name with a + character. The effect is that
# access to untrusted sites will be granted -- but only if a
# link from this trusted referrer was used to get there. The
# link target will then be added to the "trustfile" so that
# future, direct accesses will be granted. Sites added via this
# mechanism do not become trusted referrers themselves (i.e.
# they are added with a ~ designation). There is a limit of 512
# such entries, after which new entries will not be made.
#
# If you use the + operator in the trust file, it may grow
# considerably over time.
#
# It is recommended that Privoxy be compiled with the
# --disable-force, --disable-toggle and --disable-editor
# options, if this feature is to be used.
#
# Possible applications include limiting Internet access for
# children.
#
#trustfile trust
#
# 3. DEBUGGING
# =============
#
# These options are mainly useful when tracing a problem. Note that
# you might also want to invoke Privoxy with the --no-daemon command
# line option when debugging.
#
#
# 3.1. debug
# ===========
#
# Specifies:
#
# Key values that determine what information gets logged.
#
# Type of value:
#
# Integer values
#
# Default value:
#
# 0 (i.e.: only fatal errors (that cause Privoxy to exit) are
# logged)
#
# Effect if unset:
#
# Default value is used (see above).
#
# Notes:
#
# The available debug levels are:
#
# debug 1 # Log the destination for each request Privoxy let through. See also debug 1024.
# debug 2 # show each connection status
# debug 4 # show I/O status
# debug 8 # show header parsing
# debug 16 # log all data written to the network
# debug 32 # debug force feature
# debug 64 # debug regular expression filters
# debug 128 # debug redirects
# debug 256 # debug GIF de-animation
# debug 512 # Common Log Format
# debug 1024 # Log the destination for requests Privoxy didn't let through, and the reason why.
# debug 2048 # CGI user interface
# debug 4096 # Startup banner and warnings.
# debug 8192 # Non-fatal errors
# debug 32768 # log all data read from the network
# debug 65536 # Log the applying actions
#
# To select multiple debug levels, you can either add them or
# use multiple debug lines.
#
# A debug level of 1 is informative because it will show you
# each request as it happens. 1, 1024, 4096 and 8192 are
# recommended so that you will notice when things go wrong. The
# other levels are probably only of interest if you are hunting
# down a specific problem. They can produce a hell of an output
# (especially 16).
#
# Privoxy used to ship with the debug levels recommended above
# enabled by default, but due to privacy concerns 3.0.7 and
# later are configured to only log fatal errors.
#
# If you are used to the more verbose settings, simply enable
# the debug lines below again.
#
# If you want to use pure CLF (Common Log Format), you should
# set "debug 512" ONLY and not enable anything else.
#
# Privoxy has a hard-coded limit for the length of log messages.
# If it's reached, messages are logged truncated and marked with
# "... [too long, truncated]".
#
# Please don't file any support requests without trying to
# reproduce the problem with increased debug level first. Once
# you read the log messages, you may even be able to solve the
# problem on your own.
#
#debug 1 # Log the destination for each request Privoxy let through. See also debug 1024.
#debug 1024 # Log the destination for requests Privoxy didn't let through, and the reason why.
#debug 4096 # Startup banner and warnings
#debug 8192 # Non-fatal errors
#
# 3.2. single-threaded
# =====================
#
# Specifies:
#
# Whether to run only one server thread.
#
# Type of value:
#
# None
#
# Default value:
#
# Unset
#
# Effect if unset:
#
# Multi-threaded (or, where unavailable: forked) operation, i.e.
# the ability to serve multiple requests simultaneously.
#
# Notes:
#
# This option is only there for debugging purposes. It will
# drastically reduce performance.
#
#single-threaded
#
# 3.3. hostname
# ==============
#
# Specifies:
#
# The hostname shown on the CGI pages.
#
# Type of value:
#
# Text
#
# Default value:
#
# Unset
#
# Effect if unset:
#
# The hostname provided by the operating system is used.
#
# Notes:
#
# On some misconfigured systems resolving the hostname fails or
# takes too much time and slows Privoxy down. Setting a fixed
# hostname works around the problem.
#
# In other circumstances it might be desirable to show a
# hostname other than the one returned by the operating system.
# For example if the system has several different hostnames and
# you don't want to use the first one.
#
# Note that Privoxy does not validate the specified hostname
# value.
#
#hostname hostname.example.org
#
# 4. ACCESS CONTROL AND SECURITY
# ===============================
#
# This section of the config file controls the security-relevant
# aspects of Privoxy's configuration.
#
#
# 4.1. listen-address
# ====================
#
# Specifies:
#
# The address and TCP port on which Privoxy will listen for
# client requests.
#
# Type of value:
#
# [IP-Address]:Port
#
# [Hostname]:Port
#
# Default value:
#
# 127.0.0.1:8118
#
# Effect if unset:
#
# Bind to 127.0.0.1 (IPv4 localhost), port 8118. This is
# suitable and recommended for home users who run Privoxy on the
# same machine as their browser.
#
# Notes:
#
# You will need to configure your browser(s) to this proxy
# address and port.
#
# If you already have another service running on port 8118, or
# if you want to serve requests from other machines (e.g. on
# your local network) as well, you will need to override the
# default.
#
# You can use this statement multiple times to make Privoxy
# listen on more ports or more IP addresses. Suitable if your
# operating system does not support sharing IPv6 and IPv4
# protocols on the same socket.
#
# If a hostname is used instead of an IP address, Privoxy will
# try to resolve it to an IP address and if there are multiple,
# use the first one returned.
#
# If the address for the hostname isn't already known on the
# system (for example because it's in /etc/hostname), this may
# result in DNS traffic.
#
# If the specified address isn't available on the system, or if
# the hostname can't be resolved, Privoxy will fail to start.
#
# IPv6 addresses containing colons have to be quoted by
# brackets. They can only be used if Privoxy has been compiled
# with IPv6 support. If you aren't sure if your version supports
# it, have a look at http://config.privoxy.org/show-status.
#
# Some operating systems will prefer IPv6 to IPv4 addresses even
# if the system has no IPv6 connectivity which is usually not
# expected by the user. Some even rely on DNS to resolve
# localhost which means the "localhost" address used may not
# actually be local.
#
# It is therefore recommended to explicitly configure the
# intended IP address instead of relying on the operating
# system, unless there's a strong reason not to.
#
# If you leave out the address, Privoxy will bind to all IPv4
# interfaces (addresses) on your machine and may become
# reachable from the Internet and/or the local network. Be aware
# that some GNU/Linux distributions modify that behaviour
# without updating the documentation. Check for non-standard
# patches if your Privoxy version behaves differently.
#
# If you configure Privoxy to be reachable from the network,
# consider using access control lists (ACL's, see below), and/or
# a firewall.
#
# If you open Privoxy to untrusted users, you will also want to
# make sure that the following actions are disabled:
# enable-edit-actions and enable-remote-toggle
#
# Example:
#
# Suppose you are running Privoxy on a machine which has the
# address 192.168.0.1 on your local private network
# (192.168.0.0) and has another outside connection with a
# different address. You want it to serve requests from inside
# only:
#
# listen-address 192.168.0.1:8118
#
# Suppose you are running Privoxy on an IPv6-capable machine and
# you want it to listen on the IPv6 address of the loopback
# device:
#
# listen-address [::1]:8118
#
listen-address 127.0.0.1:8118
#
# 4.2. toggle
# ============
#
# Specifies:
#
# Initial state of "toggle" status
#
# Type of value:
#
# 1 or 0
#
# Default value:
#
# 1
#
# Effect if unset:
#
# Act as if toggled on
#
# Notes:
#
# If set to 0, Privoxy will start in "toggled off" mode, i.e.
# mostly behave like a normal, content-neutral proxy with both
# ad blocking and content filtering disabled. See
# enable-remote-toggle below.
#
toggle 1
#
# 4.3. enable-remote-toggle
# ==========================
#
# Specifies:
#
# Whether or not the web-based toggle feature may be used
#
# Type of value:
#
# 0 or 1
#
# Default value:
#
# 0
#
# Effect if unset:
#
# The web-based toggle feature is disabled.
#
# Notes:
#
# When toggled off, Privoxy mostly acts like a normal,
# content-neutral proxy, i.e. doesn't block ads or filter
# content.
#
# Access to the toggle feature can not be controlled separately
# by "ACLs" or HTTP authentication, so that everybody who can
# access Privoxy (see "ACLs" and listen-address above) can
# toggle it for all users. So this option is not recommended for
# multi-user environments with untrusted users.
#
# Note that malicious client side code (e.g. Java) is also
# capable of using this option.
#
# As a lot of Privoxy users don't read documentation, this
# feature is disabled by default.
#
# Note that you must have compiled Privoxy with support for this
# feature, otherwise this option has no effect.
#
enable-remote-toggle 0
#
# 4.4. enable-remote-http-toggle
# ===============================
#
# Specifies:
#
# Whether or not Privoxy recognizes special HTTP headers to
# change its behaviour.
#
# Type of value:
#
# 0 or 1
#
# Default value:
#
# 0
#
# Effect if unset:
#
# Privoxy ignores special HTTP headers.
#
# Notes:
#
# When toggled on, the client can change Privoxy's behaviour by
# setting special HTTP headers. Currently the only supported
# special header is "X-Filter: No", to disable filtering for the
# ongoing request, even if it is enabled in one of the action
# files.
#
# This feature is disabled by default. If you are using Privoxy
# in an environment with trusted clients, you may enable this
# feature at your discretion. Note that malicious client side
# code (e.g. Java) is also capable of using this feature.
#
# This option will be removed in future releases as it has been
# obsoleted by the more general header taggers.
#
enable-remote-http-toggle 0
#
# 4.5. enable-edit-actions
# =========================
#
# Specifies:
#
# Whether or not the web-based actions file editor may be used
#
# Type of value:
#
# 0 or 1
#
# Default value:
#
# 0
#
# Effect if unset:
#
# The web-based actions file editor is disabled.
#
# Notes:
#
# Access to the editor can not be controlled separately by
# "ACLs" or HTTP authentication, so that everybody who can
# access Privoxy (see "ACLs" and listen-address above) can
# modify its configuration for all users.
#
# This option is not recommended for environments with untrusted
# users and as a lot of Privoxy users don't read documentation,
# this feature is disabled by default.
#
# Note that malicious client side code (e.g. Java) is also
# capable of using the actions editor and you shouldn't enable
# this option unless you understand the consequences and are
# sure your browser is configured correctly.
#
# Note that you must have compiled Privoxy with support for this
# feature, otherwise this option has no effect.
#
enable-edit-actions 0
#
# 4.6. enforce-blocks
# ====================
#
# Specifies:
#
# Whether the user is allowed to ignore blocks and can "go there
# anyway".
#
# Type of value:
#
# 0 or 1
#
# Default value:
#
# 0
#
# Effect if unset:
#
# Blocks are not enforced.
#
# Notes:
#
# Privoxy is mainly used to block and filter requests as a
# service to the user, for example to block ads and other junk
# that clogs the pipes. Privoxy's configuration isn't perfect
# and sometimes innocent pages are blocked. In this situation it
# makes sense to allow the user to enforce the request and have
# Privoxy ignore the block.
#
# In the default configuration Privoxy's "Blocked" page contains
# a "go there anyway" link that adds a special string (the force
# prefix) to the request URL. If that link is used, Privoxy will
# detect the force prefix, remove it again and let the request
# pass.
#
# Of course Privoxy can also be used to enforce a network
# policy. In that case the user obviously should not be able to
# bypass any blocks, and that's what the "enforce-blocks" option
# is for. If it's enabled, Privoxy hides the "go there anyway"
# link. If the user adds the force prefix by hand, it will not
# be accepted and the circumvention attempt is logged.
#
# Examples:
#
# enforce-blocks 1
#
enforce-blocks 0
#
# 4.7. ACLs: permit-access and deny-access
# =========================================
#
# Specifies:
#
# Who can access what.
#
# Type of value:
#
# src_addr[:port][/src_masklen] [dst_addr[:port][/dst_masklen]]
#
# Where src_addr and dst_addr are IPv4 addresses in dotted
# decimal notation or valid DNS names, port is a port number,
# and src_masklen and dst_masklen are subnet masks in CIDR
# notation, i.e. integer values from 2 to 30 representing the
# length (in bits) of the network address. The masks and the
# whole destination part are optional.
#
# If your system implements RFC 3493, then src_addr and dst_addr
# can be IPv6 addresses delimited by brackets, port can be a
# number or a service name, and src_masklen and dst_masklen can
# be a number from 0 to 128.
#
# Default value:
#
# Unset
#
# If no port is specified, any port will match. If no
# src_masklen or dst_masklen is given, the complete IP address
# has to match (i.e. 32 bits for IPv4 and 128 bits for IPv6).
#
# Effect if unset:
#
# Don't restrict access further than implied by listen-address
#
# Notes:
#
# Access controls are included at the request of ISPs and
# systems administrators, and are not usually needed by
# individual users. For a typical home user, it will normally
# suffice to ensure that Privoxy only listens on the localhost
# (127.0.0.1) or internal (home) network address by means of the
# listen-address option.
#
# Please see the warnings in the FAQ that Privoxy is not
# intended to be a substitute for a firewall or to encourage
# anyone to defer addressing basic security weaknesses.
#
# Multiple ACL lines are OK. If any ACLs are specified, Privoxy
# only talks to IP addresses that match at least one
# permit-access line and don't match any subsequent deny-access
# line. In other words, the last match wins, with the default
# being deny-access.
#
# If Privoxy is using a forwarder (see forward below) for a
# particular destination URL, the dst_addr that is examined is
# the address of the forwarder and NOT the address of the
# ultimate target. This is necessary because it may be
# impossible for the local Privoxy to determine the IP address
# of the ultimate target (that's often what gateways are used
# for).
#
# You should prefer using IP addresses over DNS names, because
# the address lookups take time. All DNS names must resolve! You
# can not use domain patterns like "*.org" or partial domain
# names. If a DNS name resolves to multiple IP addresses, only
# the first one is used.
#
# Some systems allow IPv4 clients to connect to IPv6 server
# sockets. Then the client's IPv4 address will be translated by
# the system into IPv6 address space with special prefix
# ::ffff:0:0/96 (so called IPv4 mapped IPv6 address). Privoxy
# can handle it and maps such ACL addresses automatically.
#
# Denying access to particular sites by ACL may have undesired
# side effects if the site in question is hosted on a machine
# which also hosts other sites (most sites are).
#
# Examples:
#
# Explicitly define the default behavior if no ACL and
# listen-address are set: "localhost" is OK. The absence of a
# dst_addr implies that all destination addresses are OK:
#
# permit-access localhost
#
# Allow any host on the same class C subnet as www.privoxy.org
# access to nothing but www.example.com (or other domains hosted
# on the same system):
#
# permit-access www.privoxy.org/24 www.example.com/32
#
# Allow access from any host on the 26-bit subnet 192.168.45.64
# to anywhere, with the exception that 192.168.45.73 may not
# access the IP address behind www.dirty-stuff.example.com:
#
# permit-access 192.168.45.64/26
# deny-access 192.168.45.73 www.dirty-stuff.example.com
#
# Allow access from the IPv4 network 192.0.2.0/24 even if
# listening on an IPv6 wild card address (not supported on all
# platforms):
#
# permit-access 192.0.2.0/24
#
# This is equivalent to the following line even if listening on
# an IPv4 address (not supported on all platforms):
#
# permit-access [::ffff:192.0.2.0]/120
#
#
# 4.8. buffer-limit
# ==================
#
# Specifies:
#
# Maximum size of the buffer for content filtering.
#
# Type of value:
#
# Size in Kbytes
#
# Default value:
#
# 4096
#
# Effect if unset:
#
# Use a 4MB (4096 KB) limit.
#
# Notes:
#
# For content filtering, i.e. the +filter and +deanimate-gif
# actions, it is necessary that Privoxy buffers the entire
# document body. This can be potentially dangerous, since a
# server could just keep sending data indefinitely and wait for
# your RAM to exhaust -- with nasty consequences. Hence this
# option.
#
# When a document buffer size reaches the buffer-limit, it is
# flushed to the client unfiltered and no further attempt to
# filter the rest of the document is made. Remember that there
# may be multiple threads running, which might require up to
# buffer-limit Kbytes each, unless you have enabled
# "single-threaded" above.
#
buffer-limit 4096
#
# 4.9. enable-proxy-authentication-forwarding
# ============================================
#
# Specifies:
#
# Whether or not proxy authentication through Privoxy should
# work.
#
# Type of value:
#
# 0 or 1
#
# Default value:
#
# 0
#
# Effect if unset:
#
# Proxy authentication headers are removed.
#
# Notes:
#
# Privoxy itself does not support proxy authentication, but can
# allow clients to authenticate against Privoxy's parent proxy.
#
# By default Privoxy (3.0.21 and later) doesn't do that and removes
# Proxy-Authorization headers in requests and Proxy-Authenticate
# headers in responses to make it harder for malicious sites to
# trick inexperienced users into providing login information.
#
# If this option is enabled the headers are forwarded.
#
# Enabling this option is not recommended if there is no parent
# proxy that requires authentication or if the local network
# between Privoxy and the parent proxy isn't trustworthy. If
# proxy authentication is only required for some requests, it is
# recommended to use a client header filter to remove the
# authentication headers for requests where they aren't needed.
#
enable-proxy-authentication-forwarding 0
#
# 5. FORWARDING
# ==============
#
# This feature allows routing of HTTP requests through a chain of
# multiple proxies.
#
# Forwarding can be used to chain Privoxy with a caching proxy to
# speed up browsing. Using a parent proxy may also be necessary if
# the machine that Privoxy runs on has no direct Internet access.
#
# Note that parent proxies can severely decrease your privacy level.
# For example a parent proxy could add your IP address to the
# request headers and if it's a caching proxy it may add the "Etag"
# header to revalidation requests again, even though you configured
# Privoxy to remove it. It may also ignore Privoxy's header time
# randomization and use the original values which could be used by
# the server as cookie replacement to track your steps between
# visits.
#
# Also specified here are SOCKS proxies. Privoxy supports the SOCKS
# 4, SOCKS 4A and SOCKS 5 protocols.
#
#
# 5.1. forward
# =============
#
# Specifies:
#
# To which parent HTTP proxy specific requests should be routed.
#
# Type of value:
#
# target_pattern http_parent[:port]
#
# where target_pattern is a URL pattern that specifies to which
# requests (i.e. URLs) this forward rule shall apply. Use / to
# denote "all URLs". http_parent[:port] is the DNS name or IP
# address of the parent HTTP proxy through which the requests
# should be forwarded, optionally followed by its listening port
# (default: 8000). Use a single dot (.) to denote "no
# forwarding".
#
# Default value:
#
# Unset
#
# Effect if unset:
#
# Don't use parent HTTP proxies.
#
# Notes:
#
# If http_parent is ".", then requests are not forwarded to
# another HTTP proxy but are made directly to the web servers.
#
# http_parent can be a numerical IPv6 address (if RFC 3493 is
# implemented). To prevent clashes with the port delimiter, the
# whole IP address has to be put into brackets. On the other
# hand a target_pattern containing an IPv6 address has to be put
# into angle brackets (normal brackets are reserved for regular
# expressions already).
#
# Multiple lines are OK, they are checked in sequence, and the
# last match wins.
#
# Examples:
#
# Everything goes to an example parent proxy, except SSL on port
# 443 (which it doesn't handle):
#
# forward / parent-proxy.example.org:8080
# forward :443 .
#
# Everything goes to our example ISP's caching proxy, except for
# requests to that ISP's sites:
#
# forward / caching-proxy.isp.example.net:8000
# forward .isp.example.net .
#
# Parent proxy specified by an IPv6 address:
#
# forward / [2001:DB8::1]:8000
#
# Suppose your parent proxy doesn't support IPv6:
#
# forward / parent-proxy.example.org:8000
# forward ipv6-server.example.org .
# forward <[2-3][0-9a-f][0-9a-f][0-9a-f]:*> .
#
#
# 5.2. forward-socks4, forward-socks4a, forward-socks5 and forward-socks5t
# =========================================================================
#
# Specifies:
#
# Through which SOCKS proxy (and optionally to which parent HTTP
# proxy) specific requests should be routed.
#
# Type of value:
#
# target_pattern socks_proxy[:port] http_parent[:port]
#
# where target_pattern is a URL pattern that specifies to which
# requests (i.e. URLs) this forward rule shall apply. Use / to
# denote "all URLs". http_parent and socks_proxy are IP
# addresses in dotted decimal notation or valid DNS names (
# http_parent may be "." to denote "no HTTP forwarding"), and
# the optional port parameters are TCP ports, i.e. integer
# values from 1 to 65535
#
# Default value:
#
# Unset
#
# Effect if unset:
#
# Don't use SOCKS proxies.
#
# Notes:
#
# Multiple lines are OK, they are checked in sequence, and the
# last match wins.
#
# The difference between forward-socks4 and forward-socks4a is
# that in the SOCKS 4A protocol, the DNS resolution of the
# target hostname happens on the SOCKS server, while in SOCKS 4
# it happens locally.
#
# With forward-socks5 the DNS resolution will happen on the
# remote server as well.
#
# forward-socks5t works like vanilla forward-socks5 but lets
# Privoxy additionally use Tor-specific SOCKS extensions.
# Currently the only supported SOCKS extension is optimistic
# data which can reduce the latency for the first request made
# on a newly created connection.
#
# socks_proxy and http_parent can be a numerical IPv6 address
# (if RFC 3493 is implemented). To prevent clashes with the port
# delimiter, the whole IP address has to be put into brackets.
# On the other hand a target_pattern containing an IPv6 address
# has to be put into angle brackets (normal brackets are
# reserved for regular expressions already).
#
# If http_parent is ".", then requests are not forwarded to
# another HTTP proxy but are made (HTTP-wise) directly to the
# web servers, albeit through a SOCKS proxy.
#
# Examples:
#
# From the company example.com, direct connections are made to
# all "internal" domains, but everything outbound goes through
# their ISP's proxy by way of example.com's corporate SOCKS 4A
# gateway to the Internet.
#
# forward-socks4a / socks-gw.example.com:1080 www-cache.isp.example.net:8080
# forward .example.com .
#
# A rule that uses a SOCKS 4 gateway for all destinations but no
# HTTP parent looks like this:
#
# forward-socks4 / socks-gw.example.com:1080 .
#
# To chain Privoxy and Tor, both running on the same system, you
# would use something like:
#
# forward-socks5 / 127.0.0.1:9050 .
#
# The public Tor network can't be used to reach your local
# network, if you need to access local servers you therefore
# might want to make some exceptions:
#
# forward 192.168.*.*/ .
# forward 10.*.*.*/ .
# forward 127.*.*.*/ .
#
# Unencrypted connections to systems in these address ranges
# will be as (un)secure as the local network is, but the
# alternative is that you can't reach the local network through
# Privoxy at all. Of course this may actually be desired and
# there is no reason to make these exceptions if you aren't sure
# you need them.
#
# If you also want to be able to reach servers in your local
# network by using their names, you will need additional
# exceptions that look like this:
#
# forward localhost/ .
#
#
# 5.3. forwarded-connect-retries
# ===============================
#
# Specifies:
#
# How often Privoxy retries if a forwarded connection request
# fails.
#
# Type of value:
#
# Number of retries.
#
# Default value:
#
# 0
#
# Effect if unset:
#
# Connections forwarded through other proxies are treated like
# direct connections and no retry attempts are made.
#
# Notes:
#
# forwarded-connect-retries is mainly interesting for socks4a
# connections, where Privoxy can't detect why the connections
# failed. The connection might have failed because of a DNS
# timeout in which case a retry makes sense, but it might also
# have failed because the server doesn't exist or isn't
# reachable. In this case the retry will just delay the
# appearance of Privoxy's error message.
#
# Note that in the context of this option, "forwarded
# connections" includes all connections that Privoxy forwards
# through other proxies. This option is not limited to the HTTP
# CONNECT method.
#
# Only use this option, if you are getting lots of
# forwarding-related error messages that go away when you try
# again manually. Start with a small value and check Privoxy's
# logfile from time to time, to see how many retries are usually
# needed.
#
# Examples:
#
# forwarded-connect-retries 1
#
forwarded-connect-retries 0
#
# 6. MISCELLANEOUS
# =================
#
# 6.1. accept-intercepted-requests
# =================================
#
# Specifies:
#
# Whether intercepted requests should be treated as valid.
#
# Type of value:
#
# 0 or 1
#
# Default value:
#
# 0
#
# Effect if unset:
#
# Only proxy requests are accepted, intercepted requests are
# treated as invalid.
#
# Notes:
#
# If you don't trust your clients and want to force them to use
# Privoxy, enable this option and configure your packet filter
# to redirect outgoing HTTP connections into Privoxy.
#
# Make sure that Privoxy's own requests aren't redirected as
# well. Additionally take care that Privoxy can't intentionally
# connect to itself, otherwise you could run into redirection
# loops if Privoxy's listening port is reachable by the outside
# or an attacker has access to the pages you visit.
#
# Examples:
#
# accept-intercepted-requests 1
#
accept-intercepted-requests 0
#
# 6.2. allow-cgi-request-crunching
# =================================
#
# Specifies:
#
# Whether requests to Privoxy's CGI pages can be blocked or
# redirected.
#
# Type of value:
#
# 0 or 1
#
# Default value:
#
# 0
#
# Effect if unset:
#
# Privoxy ignores block and redirect actions for its CGI pages.
#
# Notes:
#
# By default Privoxy ignores block or redirect actions for its
# CGI pages. Intercepting these requests can be useful in
# multi-user setups to implement fine-grained access control,
# but it can also render the complete web interface useless and
# make debugging problems painful if done without care.
#
# Don't enable this option unless you're sure that you really
# need it.
#
# Examples:
#
# allow-cgi-request-crunching 1
#
allow-cgi-request-crunching 0
#
# 6.3. split-large-forms
# =======================
#
# Specifies:
#
# Whether the CGI interface should stay compatible with broken
# HTTP clients.
#
# Type of value:
#
# 0 or 1
#
# Default value:
#
# 0
#
# Effect if unset:
#
# The CGI forms generate long GET URLs.
#
# Notes:
#
# Privoxy's CGI forms can lead to rather long URLs. This isn't a
# problem as far as the HTTP standard is concerned, but it can
# confuse clients with arbitrary URL length limitations.
#
# Enabling split-large-forms causes Privoxy to divide big forms
# into smaller ones to keep the URL length down. It makes
# editing a lot less convenient and you can no longer submit all
# changes at once, but at least it works around this browser
# bug.
#
# If you don't notice any editing problems, there is no reason
# to enable this option, but if one of the submit buttons
# appears to be broken, you should give it a try.
#
# Examples:
#
# split-large-forms 1
#
split-large-forms 0
#
# 6.4. keep-alive-timeout
# ========================
#
# Specifies:
#
# Number of seconds after which an open connection will no
# longer be reused.
#
# Type of value:
#
# Time in seconds.
#
# Default value:
#
# None
#
# Effect if unset:
#
# Connections are not kept alive.
#
# Notes:
#
# This option allows clients to keep the connection to Privoxy
# alive. If the server supports it, Privoxy will keep the
# connection to the server alive as well. Under certain
# circumstances this may result in speed-ups.
#
# By default, Privoxy will close the connection to the server if
# the client connection gets closed, or if the specified timeout
# has been reached without a new request coming in. This
# behaviour can be changed with the connection-sharing option.
#
# This option has no effect if Privoxy has been compiled without
# keep-alive support.
#
# Note that a timeout of five seconds as used in the default
# configuration file significantly decreases the number of
# connections that will be reused. The value is used because
# some browsers limit the number of connections they open to a
# single host and apply the same limit to proxies. This can
# result in a single website "grabbing" all the connections the
# browser allows, which means connections to other websites
# can't be opened until the connections currently in use time
# out.
#
# Several users have reported this as a Privoxy bug, so the
# default value has been reduced. Consider increasing it to 300
# seconds or even more if you think your browser can handle it.
# If your browser appears to be hanging, it probably can't.
#
# Examples:
#
# keep-alive-timeout 300
#
keep-alive-timeout 5
#
# 6.5. tolerate-pipelining
# =========================
#
# Specifies:
#
# Whether or not pipelined requests should be served.
#
# Type of value:
#
# 0 or 1.
#
# Default value:
#
# None
#
# Effect if unset:
#
# If Privoxy receives more than one request at once, it
# terminates the client connection after serving the first one.
#
# Notes:
#
# Privoxy currently doesn't pipeline outgoing requests, thus
# allowing pipelining on the client connection is not guaranteed
# to improve the performance.
#
# By default Privoxy tries to discourage clients from pipelining
# by discarding aggressively pipelined requests, which forces
# the client to resend them through a new connection.
#
# This option lets Privoxy tolerate pipelining. Whether or not
# that improves performance mainly depends on the client
# configuration.
#
# If you are seeing problems with pages not properly loading,
# disabling this option could work around the problem.
#
# Examples:
#
# tolerate-pipelining 1
#
tolerate-pipelining 1
#
# 6.6. default-server-timeout
# ============================
#
# Specifies:
#
# Assumed server-side keep-alive timeout if not specified by the
# server.
#
# Type of value:
#
# Time in seconds.
#
# Default value:
#
# None
#
# Effect if unset:
#
# Connections for which the server didn't specify the keep-alive
# timeout are not reused.
#
# Notes:
#
# Enabling this option significantly increases the number of
# connections that are reused, provided the keep-alive-timeout
# option is also enabled.
#
# While it also increases the number of connection problems
# when Privoxy tries to reuse a connection that already has been
# closed on the server side, or is closed while Privoxy is
# trying to reuse it, this should only be a problem if it
# happens for the first request sent by the client. If it
# happens for requests on reused client connections, Privoxy
# will simply close the connection and the client is supposed to
# retry the request without bothering the user.
#
# Enabling this option is therefore only recommended if the
# connection-sharing option is disabled.
#
# It is an error to specify a value larger than the
# keep-alive-timeout value.
#
# This option has no effect if Privoxy has been compiled without
# keep-alive support.
#
# Examples:
#
# default-server-timeout 60
#
#default-server-timeout 60
#
# 6.7. connection-sharing
# ========================
#
# Specifies:
#
# Whether or not outgoing connections that have been kept alive
# should be shared between different incoming connections.
#
# Type of value:
#
# 0 or 1
#
# Default value:
#
# None
#
# Effect if unset:
#
# Connections are not shared.
#
# Notes:
#
# This option has no effect if Privoxy has been compiled without
# keep-alive support, or if it's disabled.
#
# Notes:
#
# Note that reusing connections doesn't necessarily cause
# speedups. There are also a few privacy implications you should
# be aware of.
#
# If this option is effective, outgoing connections are shared
# between clients (if there is more than one) and closing the
# browser that initiated the outgoing connection no longer
# affects the connection between Privoxy and the server unless
# the client's request hasn't been completed yet.
#
# If the outgoing connection is idle, it will not be closed
# until either Privoxy's or the server's timeout is reached.
# While it's open, the server knows that the system running
# Privoxy is still there.
#
# If there is more than one client (maybe even belonging to
# multiple users), they will be able to reuse each other's
# connections. This is potentially dangerous in case of
# authentication schemes like NTLM where only the connection is
# authenticated, instead of requiring authentication for each
# request.
#
# If there is only a single client, and if said client can keep
# connections alive on its own, enabling this option has next to
# no effect. If the client doesn't support connection
# keep-alive, enabling this option may make sense as it allows
# Privoxy to keep outgoing connections alive even if the client
# itself doesn't support it.
#
# You should also be aware that enabling this option increases
# the likelihood of getting the "No server or forwarder data"
# error message, especially if you are using a slow connection
# to the Internet.
#
# This option should only be used by experienced users who
# understand the risks and can weigh them against the benefits.
#
# Examples:
#
# connection-sharing 1
#
#connection-sharing 1
#
# 6.8. socket-timeout
# ====================
#
# Specifies:
#
# Number of seconds after which a socket times out if no data is
# received.
#
# Type of value:
#
# Time in seconds.
#
# Default value:
#
# None
#
# Effect if unset:
#
# A default value of 300 seconds is used.
#
# Notes:
#
# The default is quite high and you probably want to reduce it.
# If you aren't using an occasionally slow proxy like Tor,
# reducing it to a few seconds should be fine.
#
# Examples:
#
# socket-timeout 300
#
socket-timeout 300
#
# 6.9. max-client-connections
# ============================
#
# Specifies:
#
# Maximum number of client connections that will be served.
#
# Type of value:
#
# Positive number.
#
# Default value:
#
# 128
#
# Effect if unset:
#
# Connections are served until a resource limit is reached.
#
# Notes:
#
# Privoxy creates one thread (or process) for every incoming
# client connection that isn't rejected based on the access
# control settings.
#
# If the system is powerful enough, Privoxy can theoretically
# deal with several hundred (or thousand) connections at the
# same time, but some operating systems enforce resource limits
# by shutting down offending processes and their default limits
# may be below the ones Privoxy would require under heavy load.
#
# Configuring Privoxy to enforce a connection limit below the
# thread or process limit used by the operating system makes
# sure this doesn't happen. Simply increasing the operating
# system's limit would work too, but if Privoxy isn't the only
# application running on the system, you may actually want to
# limit the resources used by Privoxy.
#
# If Privoxy is only used by a single trusted user, limiting the
# number of client connections is probably unnecessary. If there
# are multiple possibly untrusted users you probably still want
# to additionally use a packet filter to limit the maximal
# number of incoming connections per client. Otherwise a
# malicious user could intentionally create a high number of
# connections to prevent other users from using Privoxy.
#
# Obviously using this option only makes sense if you choose a
# limit below the one enforced by the operating system.
#
# On most POSIX-compliant systems Privoxy can't properly deal
# with more than FD_SETSIZE file descriptors at the same time
# and has to reject connections if the limit is reached. This
# will likely change in a future version, but currently this
# limit can't be increased without recompiling Privoxy with a
# different FD_SETSIZE limit.
#
# Examples:
#
# max-client-connections 256
#
#max-client-connections 256
#
# 6.10. handle-as-empty-doc-returns-ok
# =====================================
#
# Specifies:
#
# The status code Privoxy returns for pages blocked with
# +handle-as-empty-document.
#
# Type of value:
#
# 0 or 1
#
# Default value:
#
# 0
#
# Effect if unset:
#
# Privoxy returns a status 403(forbidden) for all blocked pages.
#
# Effect if set:
#
# Privoxy returns a status 200(OK) for pages blocked with
# +handle-as-empty-document and a status 403(Forbidden) for all
# other blocked pages.
#
# Notes:
#
# This is a work-around for Firefox bug 492459: " Websites are
# no longer rendered if SSL requests for JavaScripts are blocked
# by a proxy. " (https://bugzilla.mozilla.org/show_bug.cgi?id=
# 492459) As the bug has been fixed for quite some time this
# option should no longer be needed and will be removed in a
# future release. Please speak up if you have a reason why the
# option should be kept around.
#
#handle-as-empty-doc-returns-ok 1
#
# 6.11. enable-compression
# =========================
#
# Specifies:
#
# Whether or not buffered content is compressed before delivery.
#
# Type of value:
#
# 0 or 1
#
# Default value:
#
# 0
#
# Effect if unset:
#
# Privoxy does not compress buffered content.
#
# Effect if set:
#
# Privoxy compresses buffered content before delivering it to
# the client, provided the client supports it.
#
# Notes:
#
# This directive is only supported if Privoxy has been compiled
# with FEATURE_COMPRESSION, which should not be confused with
# FEATURE_ZLIB.
#
# Compressing buffered content is mainly useful if Privoxy and
# the client are running on different systems. If they are
# running on the same system, enabling compression is likely to
# slow things down. If you didn't measure otherwise, you should
# assume that it does and keep this option disabled.
#
# Privoxy will not compress buffered content below a certain
# length.
#
#enable-compression 1
#
# 6.12. compression-level
# ========================
#
# Specifies:
#
# The compression level that is passed to the zlib library when
# compressing buffered content.
#
# Type of value:
#
# Integer ranging from 0 to 9.
#
# Default value:
#
# 1
#
# Notes:
#
# Compressing the data more usually takes longer than
# compressing it less or not compressing it at all. Which level
# is best depends on the connection between Privoxy and the
# client. If you can't be bothered to benchmark it for yourself,
# you should stick with the default and keep compression
# disabled.
#
# If compression is disabled, the compression level is
# irrelevant.
#
# Examples:
#
# # Best speed (compared to the other levels)
# compression-level 1
#
# # Best compression
# compression-level 9
#
# # No compression. Only useful for testing as the added header
# # slightly increases the amount of data that has to be sent.
# # If your benchmark shows that using this compression level
# # is superior to using no compression at all, the benchmark
# # is likely to be flawed.
# compression-level 0
#
#
#compression-level 1
#
# 6.13. client-header-order
# ==========================
#
# Specifies:
#
# The order in which client headers are sorted before forwarding
# them.
#
# Type of value:
#
# Client header names delimited by spaces or tabs
#
# Default value:
#
# None
#
# Notes:
#
# By default Privoxy leaves the client headers in the order they
# were sent by the client. Headers are modified in-place, new
# headers are added at the end of the already existing headers.
#
# The header order can be used to fingerprint client requests
# independently of other headers like the User-Agent.
#
# This directive allows sorting the headers differently to
# better mimic a different User-Agent. Client headers will be
# emitted in the order given, headers whose name isn't
# explicitly specified are added at the end.
#
# Note that sorting headers in an uncommon way will make
# fingerprinting actually easier. Encrypted headers are not
# affected by this directive.
#
#client-header-order Host \
# Accept \
# Accept-Language \
# Accept-Encoding \
# Proxy-Connection \
# Referer \
# Cookie \
# DNT \
# If-Modified-Since \
# Cache-Control \
# Content-Length \
# Content-Type
#
#
# 7. WINDOWS GUI OPTIONS
# =======================
#
# Privoxy has a number of options specific to the Windows GUI
# interface:
#
#
#
# If "activity-animation" is set to 1, the Privoxy icon will animate
# when "Privoxy" is active. To turn off, set to 0.
#
#activity-animation 1
#
#
#
# If "log-messages" is set to 1, Privoxy copies log messages to the
# console window. The log detail depends on the debug directive.
#
#log-messages 1
#
#
#
# If "log-buffer-size" is set to 1, the size of the log buffer, i.e.
# the amount of memory used for the log messages displayed in the
# console window, will be limited to "log-max-lines" (see below).
#
# Warning: Setting this to 0 will cause the buffer to grow
# without bound and eat up all your memory!
#
#log-buffer-size 1
#
#
#
# log-max-lines is the maximum number of lines held in the log
# buffer. See above.
#
#log-max-lines 200
#
#
#
# If "log-highlight-messages" is set to 1, Privoxy will highlight
# portions of the log messages with a bold-faced font:
#
#log-highlight-messages 1
#
#
#
# The font used in the console window:
#
#log-font-name Comic Sans MS
#
#
#
# Font size used in the console window:
#
#log-font-size 8
#
#
#
# "show-on-task-bar" controls whether or not Privoxy will appear as
# a button on the Task bar when minimized:
#
#show-on-task-bar 0
#
#
#
# If "close-button-minimizes" is set to 1, the Windows close button
# will minimize Privoxy instead of closing the program (close with
# the exit option on the File menu).
#
#close-button-minimizes 1
#
#
#
# The "hide-console" option is specific to the MS-Win console
# version of Privoxy. If this option is used, Privoxy will
# disconnect from and hide the command console.
#
#hide-console
#
#
#
privoxy-3.0.21-stable/./encode.h 000640 001751 001751 00000004501 11655471470 015402 0 ustar 00fk fk 000000 000000 #ifndef ENCODE_H_INCLUDED
#define ENCODE_H_INCLUDED
#define ENCODE_H_VERSION "$Id: encode.h,v 1.12 2011/11/06 11:44:56 fabiankeil Exp $"
/*********************************************************************
*
* File : $Source: /cvsroot/ijbswa/current/encode.h,v $
*
* Purpose : Functions to encode and decode URLs, and also to
* encode cookies and HTML text.
*
* Copyright : Written by and Copyright (C) 2001 the SourceForge
* Privoxy team. http://www.privoxy.org/
*
* Based on the Internet Junkbuster originally written
* by and Copyright (C) 1997 Anonymous Coders and
* Junkbusters Corporation. http://www.junkbusters.com
*
* This program is free software; you can redistribute it
* and/or modify it under the terms of the GNU General
* Public License as published by the Free Software
* Foundation; either version 2 of the License, or (at
* your option) any later version.
*
* This program is distributed in the hope that it will
* be useful, but WITHOUT ANY WARRANTY; without even the
* implied warranty of MERCHANTABILITY or FITNESS FOR A
* PARTICULAR PURPOSE. See the GNU General Public
* License for more details.
*
* The GNU General Public License should be included with
* this file. If not, you can view it at
* http://www.gnu.org/copyleft/gpl.html
* or write to the Free Software Foundation, Inc., 59
* Temple Place - Suite 330, Boston, MA 02111-1307, USA.
*
*********************************************************************/
#ifdef __cplusplus
extern "C" {
#endif
extern char * html_encode(const char *s);
extern char * url_encode(const char *s);
extern char * url_decode(const char *str);
extern int xtoi(const char *s);
extern char * html_encode_and_free_original(char *s);
extern char * percent_encode_url(const char *s);
/* Revision control strings from this header and associated .c file */
extern const char encode_rcs[];
extern const char encode_h_rcs[];
#ifdef __cplusplus
} /* extern "C" */
#endif
#endif /* ndef ENCODE_H_INCLUDED */
/*
Local Variables:
tab-width: 3
end:
*/
privoxy-3.0.21-stable/./w32taskbar.h 000640 001751 001751 00000004331 11630656300 016117 0 ustar 00fk fk 000000 000000 #ifndef W32TASKBAR_H_INCLUDED
#define W32TASKBAR_H_INCLUDED
#define W32TASKBAR_H_VERSION "$Id: w32taskbar.h,v 1.8 2011/09/04 11:10:56 fabiankeil Exp $"
/*********************************************************************
*
* File : $Source: /cvsroot/ijbswa/current/w32taskbar.h,v $
*
* Purpose : Functions for creating, setting and destroying the
* workspace tray icon
*
* Copyright : Written by and Copyright (C) 2001-2002 members of
* the Privoxy team. http://www.privoxy.org/
*
* Written by and Copyright (C) 1999 Adam Lock
*
*
* This program is free software; you can redistribute it
* and/or modify it under the terms of the GNU General
* Public License as published by the Free Software
* Foundation; either version 2 of the License, or (at
* your option) any later version.
*
* This program is distributed in the hope that it will
* be useful, but WITHOUT ANY WARRANTY; without even the
* implied warranty of MERCHANTABILITY or FITNESS FOR A
* PARTICULAR PURPOSE. See the GNU General Public
* License for more details.
*
* The GNU General Public License should be included with
* this file. If not, you can view it at
* http://www.gnu.org/copyleft/gpl.html
* or write to the Free Software Foundation, Inc., 59
* Temple Place - Suite 330, Boston, MA 02111-1307, USA.
*
*********************************************************************/
#ifdef __cplusplus
extern "C" {
#endif
extern HWND CreateTrayWindow(HINSTANCE hInstance);
extern BOOL TrayAddIcon(HWND hwnd, UINT uID, HICON hicon, const char *pszToolTip);
extern BOOL TraySetIcon(HWND hwnd, UINT uID, HICON hicon);
extern BOOL TrayDeleteIcon(HWND hwnd, UINT uID);
/* Revision control strings from this header and associated .c file */
extern const char w32taskbar_rcs[];
extern const char w32taskbar_h_rcs[];
#ifdef __cplusplus
} /* extern "C" */
#endif
#endif /* ndef W32TASKBAR_H_INCLUDED */
/*
Local Variables:
tab-width: 3
end:
*/
privoxy-3.0.21-stable/./deanimate.c 000640 001751 001751 00000030554 11726427304 016072 0 ustar 00fk fk 000000 000000 const char deanimate_rcs[] = "$Id: deanimate.c,v 1.23 2012/03/09 16:24:36 fabiankeil Exp $";
/*********************************************************************
*
* File : $Source: /cvsroot/ijbswa/current/deanimate.c,v $
*
* Purpose : Declares functions to manipulate binary images on the
* fly. High-level functions include:
* - Deanimation of GIF images
*
* Copyright : Written by and Copyright (C) 2001 - 2004, 2006 by the
* SourceForge Privoxy team. http://www.privoxy.org/
*
* Based on the GIF file format specification (see
* http://tronche.com/computer-graphics/gif/gif89a.html)
* and ideas from the Image::DeAnim Perl module by
* Ken MacFarlane,
*
* This program is free software; you can redistribute it
* and/or modify it under the terms of the GNU General
* Public License as published by the Free Software
* Foundation; either version 2 of the License, or (at
* your option) any later version.
*
* This program is distributed in the hope that it will
* be useful, but WITHOUT ANY WARRANTY; without even the
* implied warranty of MERCHANTABILITY or FITNESS FOR A
* PARTICULAR PURPOSE. See the GNU General Public
* License for more details.
*
* The GNU General Public License should be included with
* this file. If not, you can view it at
* http://www.gnu.org/copyleft/gpl.html
* or write to the Free Software Foundation, Inc., 59
* Temple Place - Suite 330, Boston, MA 02111-1307, USA.
*
**********************************************************************/
#include "config.h"
#include
#include
#include "errlog.h"
#include "project.h"
#include "deanimate.h"
#include "miscutil.h"
const char deanimate_h_rcs[] = DEANIMATE_H_VERSION;
/*********************************************************************
 *
 * Function    :  buf_free
 *
 * Description :  Releases a struct binbuffer together with the data
 *                buffer it owns. Passing NULL is allowed and does
 *                nothing.
 *
 * Parameters  :
 *          1  :  buf = Pointer to the binbuffer to be freed, or NULL
 *
 * Returns     :  N/A
 *
 *********************************************************************/
void buf_free(struct binbuffer *buf)
{
   if (buf != NULL)
   {
      /* free(NULL) is a no-op, so the payload needs no check of its own. */
      free(buf->buffer);
      free(buf);
   }
}
/*********************************************************************
 *
 * Function    :  buf_extend
 *
 * Description :  Ensure that a given binbuffer can hold `length` more
 *                bytes behind its current offset, reallocating its
 *                buffer if necessary. The new size is rounded up to
 *                a multiple of 1024 bytes, so we don't have to
 *                realloc() too often.
 *
 *                The size field is only updated once the reallocation
 *                has succeeded. If the request can't be satisfied
 *                (size arithmetic would wrap around, or realloc()
 *                fails) the data buffer is freed and the binbuffer is
 *                reset to an empty state (buffer NULL, size 0,
 *                offset 0), so that it never claims more bytes than
 *                it actually owns.
 *
 * Parameters  :
 *          1  :  buf = Pointer to the binbuffer
 *          2  :  length = Number of bytes needed behind buf->offset
 *
 * Returns     :  0 on success, 1 on failure.
 *
 *********************************************************************/
static int buf_extend(struct binbuffer *buf, size_t length)
{
   const size_t chunk_mask = (size_t)1023;
   const size_t size_max = (size_t)-1;
   size_t new_size;
   char *newbuf;

   /*
    * Refuse requests for which "offset + length" or the rounded
    * "size + length" would wrap around size_t.
    */
   if (length > size_max - buf->offset
    || buf->size > size_max - chunk_mask
    || length > size_max - chunk_mask - buf->size)
   {
      freez(buf->buffer);
      buf->size = 0;
      buf->offset = 0;
      return 1;
   }

   /* Enough room already? */
   if (buf->offset + length <= buf->size)
   {
      return 0;
   }

   new_size = (buf->size + length + chunk_mask) & ~chunk_mask;
   newbuf = (char *)realloc(buf->buffer, new_size);
   if (newbuf == NULL)
   {
      /*
       * realloc() left the old block untouched; release it and
       * reset the bookkeeping so size and offset match the (now
       * missing) buffer.
       */
      freez(buf->buffer);
      buf->size = 0;
      buf->offset = 0;
      return 1;
   }

   buf->buffer = newbuf;
   buf->size = new_size;

   return 0;
}
/*********************************************************************
 *
 * Function    :  buf_copy
 *
 * Description :  Copies `length` bytes from the current offset of one
 *                struct binbuffer to the current offset of another
 *                and moves both offsets forward by `length`.
 *                Nothing is copied if the source doesn't contain
 *                that many bytes or the destination can't be grown.
 *
 * Parameters  :
 *          1  :  src = Pointer to the source binbuffer
 *          2  :  dst = Pointer to the destination binbuffer
 *          3  :  length = Number of bytes to be copied
 *
 * Returns     :  0 on success, 1 on failure.
 *
 *********************************************************************/
static int buf_copy(struct binbuffer *src, struct binbuffer *dst, size_t length)
{
   /* Can't hand out more data than the source holds. */
   const int src_too_short = (src->offset + length > src->size);

   /*
    * The destination is only touched (grown) when the source
    * check has passed.
    */
   if (src_too_short || buf_extend(dst, length))
   {
      return 1;
   }

   /* Both ranges are known to be valid now. */
   memcpy(dst->buffer + dst->offset, src->buffer + src->offset, length);
   dst->offset += length;
   src->offset += length;

   return 0;
}
/*********************************************************************
 *
 * Function    :  buf_getbyte
 *
 * Description :  Reads a single byte from a binbuffer without moving
 *                its offset. The position is given relative to the
 *                buffer's current offset and is bounds-checked
 *                against the buffer's size.
 *
 * Parameters  :
 *          1  :  src = Pointer to the source binbuffer
 *          2  :  offset = Distance of the desired byte from
 *                         src->offset
 *
 * Returns     :  The byte on success, or 0 if the position lies
 *                outside of the buffer.
 *
 *********************************************************************/
static unsigned char buf_getbyte(const struct binbuffer *src, size_t offset)
{
   const size_t position = src->offset + offset;

   if (position >= src->size)
   {
      return '\0';
   }

   return (unsigned char)src->buffer[position];
}
/*********************************************************************
 *
 * Function    :  gif_skip_data_block
 *
 * Description :  Moves the offset of a struct binbuffer that holds a
 *                GIF image, and is positioned at the start of a data
 *                block, to the first byte behind that block.
 *
 * Parameters  :
 *          1  :  buf = Pointer to the binbuffer
 *
 * Returns     :  0 on success, or 1 if a chunk would reach the end
 *                of the buffer.
 *
 *********************************************************************/
static int gif_skip_data_block(struct binbuffer *buf)
{
   unsigned char chunk_length;

   /*
    * A data block is a sequence of chunks. Every chunk starts
    * with a one-byte length field, and a chunk of length zero
    * terminates the block.
    */
   for (;;)
   {
      chunk_length = buf_getbyte(buf, 0);
      if (chunk_length == '\0')
      {
         break;
      }

      /* Step over the length byte and the chunk's payload. */
      buf->offset += (size_t)chunk_length + 1;
      if (buf->offset >= buf->size - 1)
      {
         return 1;
      }
   }

   /* Step over the terminating zero-length chunk. */
   buf->offset++;

   return 0;
}
/*********************************************************************
 *
 * Function    :  gif_extract_image
 *
 * Description :  Safely extracts an image data block from a given
 *                struct binbuffer that contains a GIF image and whose
 *                offset is positioned at the start of an image block
 *                into a given destination binbuffer.
 *
 *                The data is appended at dst's current offset. On
 *                success dst's buffer is trimmed to the number of
 *                bytes written and its offset is rewound to 0.
 *
 *                If trimming the buffer fails, dst->buffer is left
 *                pointing at the still valid, untrimmed allocation
 *                so that it can be freed through dst and isn't
 *                leaked.
 *
 * Parameters  :
 *          1  :  src = Pointer to the source binbuffer
 *          2  :  dst = Pointer to the destination binbuffer
 *
 * Returns     :  0 on success, or 1 on failure
 *
 *********************************************************************/
static int gif_extract_image(struct binbuffer *src, struct binbuffer *dst)
{
   unsigned char c;
   char *trimmed;

   /*
    * Remember the byte carrying the colormap flag (last byte of
    * the 10-byte image head) and copy the image head.
    */
   c = buf_getbyte(src, 9);
   if (buf_copy(src, dst, 10))
   {
      return 1;
   }

   /*
    * If the image has a local colormap (bit 0x80), copy it.
    * The low three bits encode the number of entries as
    * 2^(n+1), each entry being three bytes long.
    */
   if (c & 0x80)
   {
      int map_length = 3 * (1 << ((c & 0x07) + 1));

      /* Purely defensive, the expression above yields 6 to 768. */
      if (map_length <= 0)
      {
         log_error(LOG_LEVEL_DEANIMATE,
            "colormap length = %d (%c)?", map_length, c);
         return 1;
      }
      if (buf_copy(src, dst, (size_t)map_length))
      {
         return 1;
      }
   }

   /*
    * Copy the single byte in front of the image data chunks
    * (the LZW minimum code size in GIF89a terms).
    */
   if (buf_copy(src, dst, 1))
   {
      return 1;
   }

   /*
    * Copy the image chunk by chunk. Each chunk is headed by a
    * one-byte length field, a zero length ends the sequence.
    */
   while ((c = buf_getbyte(src, 0)) != '\0')
   {
      if (buf_copy(src, dst, 1 + (size_t)c))
      {
         return 1;
      }
   }

   /* Copy the terminating zero-length chunk as well. */
   if (buf_copy(src, dst, 1))
   {
      return 1;
   }

   /*
    * Trim and rewind the dst buffer. The result of realloc() goes
    * into a temporary first: assigning it to dst->buffer directly
    * would lose the only pointer to the original block on failure.
    */
   trimmed = (char *)realloc(dst->buffer, dst->offset);
   if (trimmed == NULL)
   {
      return 1;
   }
   dst->buffer = trimmed;
   dst->size = dst->offset;
   dst->offset = 0;

   return 0;
}
/*********************************************************************
*
* Function : gif_deanimate
*
* Description : Deanimate a given GIF image, i.e. given a GIF with
* an (optional) image block and an arbitrary number
* of image extension blocks, produce an output GIF with
* only one image block that contains the last image
* (extension) block of the original.
* Also strip Comments, Application extensions, etc.
*
* Parameters :
* 1 : src = Pointer to the source binbuffer
* 2 : dst = Pointer to the destination binbuffer
* 3 : get_first_image = Flag: If set, get the first image
* If unset (default), get the last
*
* Returns : 0 on success, or 1 on failure
*
*********************************************************************/
int gif_deanimate(struct binbuffer *src, struct binbuffer *dst, int get_first_image)
{
unsigned char c;
struct binbuffer *image;
if (NULL == src || NULL == dst)
{
return 1;
}
c = buf_getbyte(src, 10);
/*
* Check & copy GIF header
*/
if (strncmp(src->buffer, "GIF89a", 6) && strncmp(src->buffer, "GIF87a", 6))
{
return 1;
}
else
{
if (buf_copy(src, dst, 13))
{
return 1;
}
}
/*
* Look for global colormap and copy if found.
*/
if (c & 0x80)
{
int map_length = 3 * (1 << ((c & 0x07) + 1));
if (map_length <= 0)
{
log_error(LOG_LEVEL_DEANIMATE,
"colormap length = %d (%c)?", map_length, c);
return 1;
}
if (buf_copy(src, dst, (size_t)map_length))
{
return 1;
}
}
/*
* Reserve a buffer for the current image block
*/
if (NULL == (image = (struct binbuffer *)zalloc(sizeof(*image))))
{
return 1;
}
/*
* Parse the GIF block by block and copy the relevant
* parts to dst
*/
while(src->offset < src->size)
{
switch(buf_getbyte(src, 0))
{
/*
* End-of-GIF Marker: Append current image and return
*/
case 0x3b:
goto write;
/*
* Image block: Extract to current image buffer.
*/
case 0x2c:
image->offset = 0;
if (gif_extract_image(src, image)) goto failed;
if (get_first_image) goto write;
continue;
/*
* Extension block: Look at next byte and decide
*/
case 0x21:
switch (buf_getbyte(src, 1))
{
/*
* Image extension: Copy extension header and image
* to the current image buffer
*/
case 0xf9:
image->offset = 0;
if (buf_copy(src, image, 8) || buf_getbyte(src, 0) != 0x2c) goto failed;
if (gif_extract_image(src, image)) goto failed;
if (get_first_image) goto write;
continue;
/*
* Application extension: Skip
*/
case 0xff:
if ((src->offset += 14) >= src->size || gif_skip_data_block(src)) goto failed;
continue;
/*
* Comment extension: Skip
*/
case 0xfe:
if ((src->offset += 2) >= src->size || gif_skip_data_block(src)) goto failed;
continue;
/*
* Plain text extension: Skip
*/
case 0x01:
if ((src->offset += 15) >= src->size || gif_skip_data_block(src)) goto failed;
continue;
/*
* Ooops, what type of extension is that?
*/
default:
goto failed;
}
/*
* Ooops, what type of block is that?
*/
default:
goto failed;
}
} /* -END- while src */
/*
* Either we got here by goto, or because the GIF is
* bogus and EOF was reached before an end-of-gif marker
* was found.
*/
failed:
buf_free(image);
return 1;
/*
* Append the current image to dst and return
*/
write:
if (buf_copy(image, dst, image->size)) goto failed;
if (buf_extend(dst, 1)) goto failed;
*(dst->buffer + dst->offset++) = 0x3b;
buf_free(image);
return 0;
}
/*
Local Variables:
tab-width: 3
end:
*/
privoxy-3.0.21-stable/./ssplit.h 000640 001751 001751 00000004135 11764413377 015471 0 ustar 00fk fk 000000 000000 #ifndef SSPLIT_H_INCLUDED
#define SSPLIT_H_INCLUDED
#define SSPLIT_H_VERSION "$Id: ssplit.h,v 1.11 2012/06/08 15:15:11 fabiankeil Exp $"
/*********************************************************************
*
* File : $Source: /cvsroot/ijbswa/current/ssplit.h,v $
*
* Purpose : A function to split a string at specified delimiters.
*
* Copyright : Written by and Copyright (C) 2001 the SourceForge
* Privoxy team. http://www.privoxy.org/
*
* Based on the Internet Junkbuster originally written
* by and Copyright (C) 1997 Anonymous Coders and
* Junkbusters Corporation. http://www.junkbusters.com
*
* This program is free software; you can redistribute it
* and/or modify it under the terms of the GNU General
* Public License as published by the Free Software
* Foundation; either version 2 of the License, or (at
* your option) any later version.
*
* This program is distributed in the hope that it will
* be useful, but WITHOUT ANY WARRANTY; without even the
* implied warranty of MERCHANTABILITY or FITNESS FOR A
* PARTICULAR PURPOSE. See the GNU General Public
* License for more details.
*
* The GNU General Public License should be included with
* this file. If not, you can view it at
* http://www.gnu.org/copyleft/gpl.html
* or write to the Free Software Foundation, Inc., 59
* Temple Place - Suite 330, Boston, MA 02111-1307, USA.
*
*********************************************************************/
#ifdef __cplusplus
extern "C" {
#endif
extern int ssplit(char *str, const char *delim, char *vec[], size_t vec_len);
/* Revision control strings from this header and associated .c file */
extern const char ssplit_rcs[];
extern const char ssplit_h_rcs[];
#ifdef __cplusplus
} /* extern "C" */
#endif
#endif /* ndef SSPLIT_H_INCLUDED */
/*
Local Variables:
tab-width: 3
end:
*/
privoxy-3.0.21-stable/./user.action 000640 001751 001751 00000021132 11655470441 016145 0 ustar 00fk fk 000000 000000 ######################################################################
#
# File : $Source: /cvsroot/ijbswa/current/user.action,v $
#
# $Id: user.action,v 1.13 2011/11/06 11:36:01 fabiankeil Exp $
#
# Purpose : User-maintained actions file, see
# http://www.privoxy.org/user-manual/actions-file.html
#
######################################################################
# This is the place to add your personal exceptions and additions to
# the general policies as defined in default.action. (Here they will be
# safe from updates to default.action.) Later defined actions always
# take precedence, so anything defined here should have the last word.
# See http://www.privoxy.org/user-manual/actions-file.html, or the
# comments in default.action, for an explanation of what an "action" is
# and what each action does.
# The examples included here either use bogus sites, or have the actual
# rules commented out (with the '#' character). Useful aliases are
# included in the top section as a convenience.
#############################################################################
# Aliases
#############################################################################
{{alias}}
#############################################################################
#
# You can define a short form for a list of permissions - e.g., instead
# of "-crunch-incoming-cookies -crunch-outgoing-cookies -filter -fast-redirects",
# you can just write "shop". This is called an alias.
#
# Currently, an alias can contain any character except space, tab, '=', '{'
# or '}'.
# But please use only 'a'-'z', '0'-'9', '+', and '-'.
#
# Alias names are not case sensitive.
#
# Aliases beginning with '+' or '-' may be used for system action names
# in future releases - so try to avoid alias names like this. (e.g.
# "+crunch-all-cookies" below is not a good name)
#
# Aliases must be defined before they are used.
#
# These aliases just save typing later:
#
+crunch-all-cookies = +crunch-incoming-cookies +crunch-outgoing-cookies
-crunch-all-cookies = -crunch-incoming-cookies -crunch-outgoing-cookies
allow-all-cookies = -crunch-all-cookies -session-cookies-only -filter{content-cookies}
allow-popups = -filter{all-popups} -filter{unsolicited-popups}
+block-as-image = +block{Blocked image request.} +handle-as-image
-block-as-image = -block
# These aliases define combinations of actions
# that are useful for certain types of sites:
#
fragile = -block -crunch-all-cookies -filter -fast-redirects -hide-referer -prevent-compression
shop = -crunch-all-cookies allow-popups
# Your favourite blend of filters:
#
myfilters = +filter{html-annoyances} +filter{js-annoyances} +filter{all-popups}\
+filter{webbugs} +filter{banners-by-size}
# Allow ads for selected useful free sites:
#
allow-ads = -block -filter{banners-by-size} -filter{banners-by-link}
#... etc. Customize to your heart's content.
## end aliases ########################################################
#######################################################################
# Begin examples: #####################################################
# Say you have accounts on some sites that you visit regularly, and you
# don't want to have to log in manually each time. So you'd like to allow
# persistent cookies for these sites. The allow-all-cookies alias defined
# above does exactly that, i.e. it disables crunching of cookies in any
# direction, and the processing of cookies to make them only temporary.
#
{ allow-all-cookies }
#.sourceforge.net
#sunsolve.sun.com
#slashdot.org
#.yahoo.com
#.msdn.microsoft.com
#.redhat.com
# Say the site where you do your homebanking needs to open popup
# windows, but you have chosen to kill popups unconditionally by default.
# This will allow it for .banking.example.com:
#
{ -filter{all-popups} }
.banking.example.com
# Some hosts and some file types you may not want to filter for
# various reasons:
#
{ -filter }
# Technical documentation is likely to contain strings that might
# erroneously get altered by the JavaScript-oriented filters:
#
#.tldp.org
#/(.*/)?selfhtml/
# And this stupid host sends streaming video with a wrong MIME type,
# so that Privoxy thinks it is getting HTML and starts filtering:
#
stupid-server.example.com/
# Example of a simple "block" action. Say you've seen an ad on your
# favourite page on example.com that you want to get rid of. You have
# right-clicked the image, selected "copy image location" and pasted
# the URL below while removing the leading http://, into a { +block{reason} }
# section. Note that { +handle-as-image } need not be specified, since
# all URLs ending in .gif will be tagged as images by the general rules
# as set in default.action anyway:
#
{ +block{Nasty ads.} }
www.example.com/nasty-ads/sponsor.gif
# The URLs of dynamically generated banners, especially from large banner
# farms, often don't use the well-known image file name extensions, which
# makes it impossible for Privoxy to guess the file type just by looking
# at the URL.
# You can use the +block-as-image alias defined above for these cases.
# Note that objects which match this rule but then turn out NOT to be an
# image are typically rendered as a "broken image" icon by the browser.
# Use cautiously.
#
{ +block-as-image }
#.doubleclick.net
#/Realmedia/ads/
#ar.atwola.com/
# Now you noticed that the default configuration breaks Forbes
# Magazine, but you were too lazy to find out which action is the
# culprit, and you were again too lazy to give feedback, so you just
# used the fragile alias on the site, and -- whoa! -- it worked. The
# 'fragile' alias disables those actions that are most likely to break
# a site. Also, good for testing purposes to see if it is Privoxy that
# is causing the problem or not.
#
{ fragile }
#.forbes.com
# Here are some sites we wish to support, and we will allow their ads
# through.
#
{ allow-ads }
#.sourceforge.net
#.slashdot.org
#.osdn.net
# user.action is generally the best place to define exceptions and
# additions to the default policies of default.action. Some actions are
# safe to have their default policies set here though. So let's set a
# default policy to have a 'blank' image as opposed to the checkerboard
# pattern for ALL sites. '/' of course matches all URL
# patterns:
#
{ +set-image-blocker{blank} }
#/
# Enable the following section (not the regression-test directives)
# to rewrite and redirect click-tracking URLs on news.google.com.
# Disabling JavaScript should work as well and probably works more reliably.
#
# Redirected URL = http://news.google.com/news/url?ct2=us%2F0_0_s_1_1_a&sa=t&usg=AFQjCNHJWPc7ffoSXPSqBRz55jDA0KgxOQ&cid=8797762374160&url=http%3A%2F%2Fonline.wsj.com%2Farticle%2FSB10001424052970204485304576640791304008536.html&ei=YcqeTsymCIjxggf8uQE&rt=HOMEPAGE&vm=STANDARD&bvm=section&did=-6537064229385238098
# Redirect Destination = http://online.wsj.com/article/SB10001424052970204485304576640791304008536.html
# Ignore = Yes
#
#{+fast-redirects{check-decoded-url}}
#news.google.com/news/url.*&url=http.*&
# Enable the following section (not the regression-test directives)
# to block various Facebook "like" and similar tracking URLs. At the
# time this section was added it was reported to not break Facebook
# itself but this may have changed by the time you read this. This URL
# list is probably incomplete and if you don't have an account anyway,
# you may prefer to block the whole domain.
#
# Blocked URL = http://www.facebook.com/plugins/likebox.php?href=http%3A%2F%2Ffacebook.com%2Farstechnica&width=300&colorscheme=light&show_faces=false&stream=false&header=false&height=62&border_color=%23FFFFFF
# Ignore = Yes
# Blocked URL = http://www.facebook.com/plugins/activity.php?site=arstechnica.com&width=300&height=370&header=false&colorscheme=light&recommendations=false&border_color=%23FFFFFF
# Ignore = Yes
# Blocked URL = http://www.facebook.com/plugins/fan.php?api_key=368513495882&connections=10&height=250&id=8304333127&locale=en_US&sdk=joey&stream=false&width=377
# Ignore = Yes
# Blocked URL = http://www.facebook.com/plugins/like.php?api_key=368513495882&channel_url=http%3A%2F%2Fstatic.ak.fbcdn.net%2Fconnect%2Fxd_proxy.php%3Fversion%3D3%23cb%3Df13997452c%26origin%3Dhttp%253A%252F%252Fonline.wsj.com%252Ff1b037e354%26relation%3Dparent.parent%26transport%3Dpostmessage&extended_social_context=false&href=http%3A%2F%2Fonline.wsj.com%2Farticle%2FSB10001424052970204485304576640791304008536.html&layout=button_count&locale=en_US&node_type=link&ref=wsj_share_FB&sdk=joey&send=false&show_faces=false&width=90
# Ignore = Yes
#
#{+block{Facebook "like" and similar tracking URLs.}}
#www.facebook.com/(extern|plugins)/(login_status|like(box)?|activity|fan)\.php
privoxy-3.0.21-stable/./actionlist.h 000640 001751 001751 00000025511 12047716035 016315 0 ustar 00fk fk 000000 000000 /*********************************************************************
*
* File : $Source: /cvsroot/ijbswa/current/actionlist.h,v $
*
* Purpose : Master list of supported actions.
* Not really a header, since it generates code.
* This is included (3 times!) from actions.c
* Each time, the following macros are defined to
* suitable values beforehand:
* DEFINE_ACTION_MULTI()
* DEFINE_ACTION_STRING()
* DEFINE_ACTION_BOOL()
* DEFINE_ACTION_ALIAS
*
* Copyright : Written by and Copyright (C) 2001-2008 the
* Privoxy team. http://www.privoxy.org/
*
* Based on the Internet Junkbuster originally written
* by and Copyright (C) 1997 Anonymous Coders and
* Junkbusters Corporation. http://www.junkbusters.com
*
* This program is free software; you can redistribute it
* and/or modify it under the terms of the GNU General
* Public License as published by the Free Software
* Foundation; either version 2 of the License, or (at
* your option) any later version.
*
* This program is distributed in the hope that it will
* be useful, but WITHOUT ANY WARRANTY; without even the
* implied warranty of MERCHANTABILITY or FITNESS FOR A
* PARTICULAR PURPOSE. See the GNU General Public
* License for more details.
*
* The GNU General Public License should be included with
* this file. If not, you can view it at
* http://www.gnu.org/copyleft/gpl.html
* or write to the Free Software Foundation, Inc., 59
* Temple Place - Suite 330, Boston, MA 02111-1307, USA.
*
*********************************************************************/
#if !(defined(DEFINE_ACTION_BOOL) && defined(DEFINE_ACTION_MULTI) && defined(DEFINE_ACTION_STRING))
#error Please define lots of macros before including "actionlist.h".
#endif /* !defined(all the DEFINE_ACTION_xxx macros) */
/*
 * The CGI parameter macros are optional. If the including file has
 * not defined DEFINE_CGI_PARAM_RADIO, all three DEFINE_CGI_PARAM_*
 * macros are defined to expand to nothing, so the corresponding
 * lines in the list below generate no code.
 */
#ifndef DEFINE_CGI_PARAM_RADIO
#define DEFINE_CGI_PARAM_RADIO(name, bit, index, value, is_default)
#define DEFINE_CGI_PARAM_CUSTOM(name, bit, index, default_val)
#define DEFINE_CGI_PARAM_NO_RADIO(name, bit, index, default_val)
#endif /* ndef DEFINE_CGI_PARAM_RADIO */
DEFINE_ACTION_MULTI ("add-header", ACTION_MULTI_ADD_HEADER)
DEFINE_ACTION_STRING ("block", ACTION_BLOCK, ACTION_STRING_BLOCK)
DEFINE_CGI_PARAM_NO_RADIO("block", ACTION_BLOCK, ACTION_STRING_BLOCK, "No reason specified.")
DEFINE_ACTION_STRING ("change-x-forwarded-for", ACTION_CHANGE_X_FORWARDED_FOR, ACTION_STRING_CHANGE_X_FORWARDED_FOR)
DEFINE_CGI_PARAM_RADIO ("change-x-forwarded-for", ACTION_CHANGE_X_FORWARDED_FOR, ACTION_STRING_CHANGE_X_FORWARDED_FOR, "block", 0)
DEFINE_CGI_PARAM_RADIO ("change-x-forwarded-for", ACTION_CHANGE_X_FORWARDED_FOR, ACTION_STRING_CHANGE_X_FORWARDED_FOR, "add", 1)
DEFINE_ACTION_MULTI ("client-header-filter", ACTION_MULTI_CLIENT_HEADER_FILTER)
DEFINE_ACTION_MULTI ("client-header-tagger", ACTION_MULTI_CLIENT_HEADER_TAGGER)
DEFINE_ACTION_STRING ("content-type-overwrite", ACTION_CONTENT_TYPE_OVERWRITE, ACTION_STRING_CONTENT_TYPE)
DEFINE_CGI_PARAM_NO_RADIO("content-type-overwrite", ACTION_CONTENT_TYPE_OVERWRITE, ACTION_STRING_CONTENT_TYPE, "text/html")
DEFINE_ACTION_STRING ("crunch-client-header", ACTION_CRUNCH_CLIENT_HEADER, ACTION_STRING_CLIENT_HEADER)
DEFINE_CGI_PARAM_NO_RADIO("crunch-client-header", ACTION_CRUNCH_CLIENT_HEADER, ACTION_STRING_CLIENT_HEADER, "X-Whatever:")
DEFINE_ACTION_BOOL ("crunch-if-none-match", ACTION_CRUNCH_IF_NONE_MATCH)
DEFINE_ACTION_BOOL ("crunch-incoming-cookies", ACTION_CRUNCH_INCOMING_COOKIES)
DEFINE_ACTION_BOOL ("crunch-outgoing-cookies", ACTION_CRUNCH_OUTGOING_COOKIES)
DEFINE_ACTION_STRING ("crunch-server-header", ACTION_CRUNCH_SERVER_HEADER, ACTION_STRING_SERVER_HEADER)
DEFINE_CGI_PARAM_NO_RADIO("crunch-server-header", ACTION_CRUNCH_SERVER_HEADER, ACTION_STRING_SERVER_HEADER, "X-Whatever:")
DEFINE_ACTION_STRING ("deanimate-gifs", ACTION_DEANIMATE, ACTION_STRING_DEANIMATE)
DEFINE_CGI_PARAM_RADIO ("deanimate-gifs", ACTION_DEANIMATE, ACTION_STRING_DEANIMATE, "first", 0)
DEFINE_CGI_PARAM_RADIO ("deanimate-gifs", ACTION_DEANIMATE, ACTION_STRING_DEANIMATE, "last", 1)
DEFINE_ACTION_BOOL ("downgrade-http-version", ACTION_DOWNGRADE)
#ifdef FEATURE_FAST_REDIRECTS
DEFINE_ACTION_STRING ("fast-redirects", ACTION_FAST_REDIRECTS, ACTION_STRING_FAST_REDIRECTS)
DEFINE_CGI_PARAM_RADIO ("fast-redirects", ACTION_FAST_REDIRECTS, ACTION_STRING_FAST_REDIRECTS, "simple-check", 0)
DEFINE_CGI_PARAM_RADIO ("fast-redirects", ACTION_FAST_REDIRECTS, ACTION_STRING_FAST_REDIRECTS, "check-decoded-url", 1)
#endif /* def FEATURE_FAST_REDIRECTS */
DEFINE_ACTION_MULTI ("filter", ACTION_MULTI_FILTER)
DEFINE_ACTION_BOOL ("force-text-mode", ACTION_FORCE_TEXT_MODE)
DEFINE_ACTION_STRING ("forward-override", ACTION_FORWARD_OVERRIDE, ACTION_STRING_FORWARD_OVERRIDE)
DEFINE_CGI_PARAM_CUSTOM ("forward-override", ACTION_FORWARD_OVERRIDE, ACTION_STRING_FORWARD_OVERRIDE, "forward .")
DEFINE_ACTION_BOOL ("handle-as-empty-document", ACTION_HANDLE_AS_EMPTY_DOCUMENT)
DEFINE_ACTION_BOOL ("handle-as-image", ACTION_IMAGE)
DEFINE_ACTION_STRING ("hide-accept-language", ACTION_HIDE_ACCEPT_LANGUAGE, ACTION_STRING_LANGUAGE)
DEFINE_CGI_PARAM_RADIO ("hide-accept-language", ACTION_HIDE_ACCEPT_LANGUAGE, ACTION_STRING_LANGUAGE, "block", 0)
DEFINE_CGI_PARAM_CUSTOM ("hide-accept-language", ACTION_HIDE_ACCEPT_LANGUAGE, ACTION_STRING_LANGUAGE, "de-de")
DEFINE_ACTION_STRING ("hide-content-disposition", ACTION_HIDE_CONTENT_DISPOSITION, ACTION_STRING_CONTENT_DISPOSITION)
DEFINE_CGI_PARAM_RADIO ("hide-content-disposition", ACTION_HIDE_CONTENT_DISPOSITION, ACTION_STRING_CONTENT_DISPOSITION, "block", 0)
DEFINE_CGI_PARAM_CUSTOM ("hide-content-disposition", ACTION_HIDE_CONTENT_DISPOSITION, ACTION_STRING_CONTENT_DISPOSITION, "attachment; filename=WHATEVER.txt")
DEFINE_ACTION_STRING ("hide-from-header", ACTION_HIDE_FROM, ACTION_STRING_FROM)
DEFINE_CGI_PARAM_RADIO ("hide-from-header", ACTION_HIDE_FROM, ACTION_STRING_FROM, "block", 1)
DEFINE_CGI_PARAM_CUSTOM ("hide-from-header", ACTION_HIDE_FROM, ACTION_STRING_FROM, "spam_me_senseless@sittingduck.xyz")
DEFINE_ACTION_STRING ("hide-if-modified-since", ACTION_HIDE_IF_MODIFIED_SINCE, ACTION_STRING_IF_MODIFIED_SINCE)
DEFINE_CGI_PARAM_RADIO ("hide-if-modified-since", ACTION_HIDE_IF_MODIFIED_SINCE, ACTION_STRING_IF_MODIFIED_SINCE, "block", 0)
DEFINE_CGI_PARAM_CUSTOM ("hide-if-modified-since", ACTION_HIDE_IF_MODIFIED_SINCE, ACTION_STRING_IF_MODIFIED_SINCE, "-1")
DEFINE_ACTION_STRING ("hide-referrer", ACTION_HIDE_REFERER, ACTION_STRING_REFERER)
DEFINE_CGI_PARAM_RADIO ("hide-referrer", ACTION_HIDE_REFERER, ACTION_STRING_REFERER, "conditional-forge", 3)
DEFINE_CGI_PARAM_RADIO ("hide-referrer", ACTION_HIDE_REFERER, ACTION_STRING_REFERER, "conditional-block", 2)
DEFINE_CGI_PARAM_RADIO ("hide-referrer", ACTION_HIDE_REFERER, ACTION_STRING_REFERER, "forge", 1)
DEFINE_CGI_PARAM_RADIO ("hide-referrer", ACTION_HIDE_REFERER, ACTION_STRING_REFERER, "block", 0)
DEFINE_CGI_PARAM_CUSTOM ("hide-referrer", ACTION_HIDE_REFERER, ACTION_STRING_REFERER, "http://www.privoxy.org/")
DEFINE_ACTION_STRING ("hide-user-agent", ACTION_HIDE_USER_AGENT, ACTION_STRING_USER_AGENT)
DEFINE_CGI_PARAM_NO_RADIO("hide-user-agent", ACTION_HIDE_USER_AGENT, ACTION_STRING_USER_AGENT, "Privoxy " VERSION)
DEFINE_ACTION_STRING ("limit-connect", ACTION_LIMIT_CONNECT, ACTION_STRING_LIMIT_CONNECT)
DEFINE_CGI_PARAM_NO_RADIO("limit-connect", ACTION_LIMIT_CONNECT, ACTION_STRING_LIMIT_CONNECT, "443")
DEFINE_ACTION_STRING ("limit-cookie-lifetime", ACTION_LIMIT_COOKIE_LIFETIME, ACTION_STRING_LIMIT_COOKIE_LIFETIME)
DEFINE_CGI_PARAM_CUSTOM ("limit-cookie-lifetime", ACTION_LIMIT_COOKIE_LIFETIME, ACTION_STRING_LIMIT_COOKIE_LIFETIME, "60")
DEFINE_ACTION_STRING ("overwrite-last-modified", ACTION_OVERWRITE_LAST_MODIFIED, ACTION_STRING_LAST_MODIFIED)
DEFINE_CGI_PARAM_RADIO ("overwrite-last-modified", ACTION_OVERWRITE_LAST_MODIFIED, ACTION_STRING_LAST_MODIFIED, "block", 0)
DEFINE_CGI_PARAM_RADIO ("overwrite-last-modified", ACTION_OVERWRITE_LAST_MODIFIED, ACTION_STRING_LAST_MODIFIED, "reset-to-request-time", 1)
DEFINE_CGI_PARAM_RADIO ("overwrite-last-modified", ACTION_OVERWRITE_LAST_MODIFIED, ACTION_STRING_LAST_MODIFIED, "randomize", 2)
DEFINE_ACTION_BOOL ("prevent-compression", ACTION_NO_COMPRESSION)
DEFINE_ACTION_STRING ("redirect", ACTION_REDIRECT, ACTION_STRING_REDIRECT)
DEFINE_CGI_PARAM_NO_RADIO("redirect", ACTION_REDIRECT, ACTION_STRING_REDIRECT, "http://localhost/")
DEFINE_ACTION_MULTI ("server-header-filter", ACTION_MULTI_SERVER_HEADER_FILTER)
DEFINE_ACTION_MULTI ("server-header-tagger", ACTION_MULTI_SERVER_HEADER_TAGGER)
DEFINE_ACTION_BOOL ("session-cookies-only", ACTION_SESSION_COOKIES_ONLY)
DEFINE_ACTION_STRING ("set-image-blocker", ACTION_IMAGE_BLOCKER, ACTION_STRING_IMAGE_BLOCKER)
DEFINE_CGI_PARAM_RADIO ("set-image-blocker", ACTION_IMAGE_BLOCKER, ACTION_STRING_IMAGE_BLOCKER, "pattern", 1)
DEFINE_CGI_PARAM_RADIO ("set-image-blocker", ACTION_IMAGE_BLOCKER, ACTION_STRING_IMAGE_BLOCKER, "blank", 0)
DEFINE_CGI_PARAM_CUSTOM ("set-image-blocker", ACTION_IMAGE_BLOCKER, ACTION_STRING_IMAGE_BLOCKER, CGI_PREFIX "send-banner?type=pattern")
#if DEFINE_ACTION_ALIAS
/*
* Alternative spellings
*/
DEFINE_ACTION_STRING ("hide-referer", ACTION_HIDE_REFERER, ACTION_STRING_REFERER)
DEFINE_ACTION_BOOL ("prevent-keeping-cookies", ACTION_SESSION_COOKIES_ONLY)
/*
* Pre-3.0.7 (pseudo) compatibility
*/
DEFINE_ACTION_MULTI ("filter-client-headers", ACTION_MULTI_CLIENT_HEADER_FILTER)
DEFINE_ACTION_MULTI ("filter-server-headers", ACTION_MULTI_SERVER_HEADER_FILTER)
#endif /* if DEFINE_ACTION_ALIAS */
#undef DEFINE_ACTION_MULTI
#undef DEFINE_ACTION_STRING
#undef DEFINE_ACTION_BOOL
#undef DEFINE_ACTION_ALIAS
#undef DEFINE_CGI_PARAM_CUSTOM
#undef DEFINE_CGI_PARAM_RADIO
#undef DEFINE_CGI_PARAM_NO_RADIO
privoxy-3.0.21-stable/./w32res.h 000640 001751 001751 00000007050 11630656300 015262 0 ustar 00fk fk 000000 000000 #ifndef W32RES_H_INCLUDED
#define W32RES_H_INCLUDED
#define W32RES_H_VERSION "$Id: w32res.h,v 1.20 2011/09/04 11:10:56 fabiankeil Exp $"
/*********************************************************************
*
* File : $Source: /cvsroot/ijbswa/current/w32res.h,v $
*
* Purpose : Identifiers for Windows GUI resources.
*
* Copyright : Written by and Copyright (C) 2001-2002 members of
* the Privoxy team. http://www.privoxy.org/
*
* Based on the Internet Junkbuster originally written
* by and Copyright (C) 1997 Anonymous Coders and
* Junkbusters Corporation. http://www.junkbusters.com
*
* This program is free software; you can redistribute it
* and/or modify it under the terms of the GNU General
* Public License as published by the Free Software
* Foundation; either version 2 of the License, or (at
* your option) any later version.
*
* This program is distributed in the hope that it will
* be useful, but WITHOUT ANY WARRANTY; without even the
* implied warranty of MERCHANTABILITY or FITNESS FOR A
* PARTICULAR PURPOSE. See the GNU General Public
* License for more details.
*
* The GNU General Public License should be included with
* this file. If not, you can view it at
* http://www.gnu.org/copyleft/gpl.html
* or write to the Free Software Foundation, Inc., 59
* Temple Place - Suite 330, Boston, MA 02111-1307, USA.
*
*********************************************************************/
#define IDR_TRAYMENU 101
#define IDI_IDLE 102
#define IDR_LOGVIEW 103
#define IDR_ACCELERATOR 104
#define IDR_POPUP_SELECTION 105
#define IDI_MAINICON 200
#define IDI_ANIMATED1 201
#define IDI_ANIMATED2 202
#define IDI_ANIMATED3 203
#define IDI_ANIMATED4 204
#define IDI_ANIMATED5 205
#define IDI_ANIMATED6 206
#define IDI_ANIMATED7 207
#define IDI_ANIMATED8 208
#define IDI_OFF 209
#define ID_TOGGLE_SHOWWINDOW 4000
#define ID_HELP_ABOUT 4001
#define ID_FILE_EXIT 4002
#define ID_VIEW_CLEARLOG 4003
#define ID_VIEW_LOGMESSAGES 4004
#define ID_VIEW_MESSAGEHIGHLIGHTING 4005
#define ID_VIEW_LIMITBUFFERSIZE 4006
#define ID_VIEW_ACTIVITYANIMATION 4007
#define ID_HELP_FAQ 4008
#define ID_HELP_MANUAL 4009
#define ID_HELP_GPL 4010
#define ID_HELP_STATUS 4011
#ifdef FEATURE_TOGGLE
#define ID_TOGGLE_ENABLED 4012
#endif /* def FEATURE_TOGGLE */
/* Break these out so they are easier to extend, but keep consecutive */
#define ID_TOOLS_EDITCONFIG 5000
#define ID_TOOLS_EDITDEFAULTACTIONS 5001
#define ID_TOOLS_EDITUSERACTIONS 5002
#define ID_TOOLS_EDITDEFAULTFILTERS 5003
#define ID_TOOLS_EDITUSERFILTERS 5004
#ifdef FEATURE_TRUST
#define ID_TOOLS_EDITTRUST 5005
#endif /* def FEATURE_TRUST */
#define ID_EDIT_COPY 30000
#endif /* ndef W32RES_H_INCLUDED */
/*
Local Variables:
tab-width: 3
end:
*/
privoxy-3.0.21-stable/./miscutil.h 000640 001751 001751 00000007350 12054151171 015766 0 ustar 00fk fk 000000 000000 #ifndef MISCUTIL_H_INCLUDED
#define MISCUTIL_H_INCLUDED
#define MISCUTIL_H_VERSION "$Id: miscutil.h,v 1.37 2012/11/24 13:58:17 fabiankeil Exp $"
/*********************************************************************
*
* File : $Source: /cvsroot/ijbswa/current/miscutil.h,v $
*
* Purpose : zalloc, hash_string, strcmpic, strncmpic, and
* MinGW32 strdup functions. These are each too small
* to deserve their own file but don't really fit in
* any other file.
*
* Copyright : Written by and Copyright (C) 2001-2011 the
* Privoxy team. http://www.privoxy.org/
*
* Based on the Internet Junkbuster originally written
* by and Copyright (C) 1997 Anonymous Coders and
* Junkbusters Corporation. http://www.junkbusters.com
*
* This program is free software; you can redistribute it
* and/or modify it under the terms of the GNU General
* Public License as published by the Free Software
* Foundation; either version 2 of the License, or (at
* your option) any later version.
*
* This program is distributed in the hope that it will
* be useful, but WITHOUT ANY WARRANTY; without even the
* implied warranty of MERCHANTABILITY or FITNESS FOR A
* PARTICULAR PURPOSE. See the GNU General Public
* License for more details.
*
* The GNU General Public License should be included with
* this file. If not, you can view it at
* http://www.gnu.org/copyleft/gpl.html
* or write to the Free Software Foundation, Inc., 59
* Temple Place - Suite 330, Boston, MA 02111-1307, USA.
*
*********************************************************************/
#include "project.h"
#if defined(__cplusplus)
extern "C" {
#endif
extern const char *basedir;
extern void *zalloc(size_t size);
extern char *strdup_or_die(const char *str);
extern void *malloc_or_die(size_t buffer_size);
#if defined(unix)
extern void write_pid_file(void);
#endif /* unix */
extern unsigned int hash_string(const char* s);
extern int strcmpic(const char *s1, const char *s2);
extern int strncmpic(const char *s1, const char *s2, size_t n);
extern jb_err string_append(char **target_string, const char *text_to_append);
extern jb_err string_join (char **target_string, char *text_to_append);
extern char *string_toupper(const char *string);
extern void string_move(char *dst, char *src);
extern char *chomp(char *string);
extern char *bindup(const char *string, size_t len);
extern char *make_path(const char * dir, const char * file);
long int pick_from_range(long int range);
#ifndef HAVE_SNPRINTF
extern int snprintf(char *, size_t, const char *, /*args*/ ...);
#endif /* ndef HAVE_SNPRINTF */
#if !defined(HAVE_TIMEGM) && defined(HAVE_TZSET) && defined(HAVE_PUTENV)
time_t timegm(struct tm *tm);
#endif /* !defined(HAVE_TIMEGM) && defined(HAVE_TZSET) && defined(HAVE_PUTENV) */
/* Here's looking at you, Ulrich. */
#if !defined(HAVE_STRLCPY)
size_t privoxy_strlcpy(char *destination, const char *source, size_t size);
#define strlcpy privoxy_strlcpy
#define USE_PRIVOXY_STRLCPY 1
#define HAVE_STRLCPY 1
#endif /* ndef HAVE_STRLCPY*/
#ifndef HAVE_STRLCAT
size_t privoxy_strlcat(char *destination, const char *source, size_t size);
#define strlcat privoxy_strlcat
#endif /* ndef HAVE_STRLCAT */
/* Revision control strings from this header and associated .c file */
extern const char miscutil_rcs[];
extern const char miscutil_h_rcs[];
#if defined(__cplusplus)
}
#endif
#endif /* ndef MISCUTIL_H_INCLUDED */
/*
Local Variables:
tab-width: 3
end:
*/
privoxy-3.0.21-stable/./pcre/configure.in 000640 001751 001751 00000004124 10546014100 017214 0 ustar 00fk fk 000000 000000 dnl Process this file with autoconf to produce a configure script.
dnl This is required at the start; the name is the name of a file
dnl it should be seeing, to verify it is in the same directory.
AC_INIT(dftables.c)
dnl Arrange to build config.h from config.in. Note that pcre.h is
dnl built differently, as it is just a "substitution" file.
dnl Manual says this macro should come right after AC_INIT.
AC_CONFIG_HEADER(config.h:config.in)
dnl Provide the current PCRE version information. Do not use numbers
dnl with leading zeros for the minor version, as they end up in a C
dnl macro, and may be treated as octal constants. Stick to single
dnl digits for minor numbers less than 10. There are unlikely to be
dnl that many releases anyway.
PCRE_MAJOR=3
PCRE_MINOR=4
PCRE_DATE=22-Aug-2000
PCRE_VERSION=${PCRE_MAJOR}.${PCRE_MINOR}
dnl Provide versioning information for libtool shared libraries that
dnl are built by default on Unix systems.
PCRE_LIB_VERSION=0:1:0
PCRE_POSIXLIB_VERSION=0:0:0
dnl Checks for programs.
AC_PROG_CC
AC_PROG_RANLIB
dnl Checks for header files.
AC_HEADER_STDC
AC_CHECK_HEADERS(limits.h)
dnl Checks for typedefs, structures, and compiler characteristics.
AC_C_CONST
AC_TYPE_SIZE_T
dnl Checks for library functions.
AC_CHECK_FUNCS(bcopy memmove strerror)
dnl Handle --enable-shared / --disable-shared
LIBTOOL=./libtool
LIBSUFFIX=la
AC_ARG_ENABLE(shared,
[ --disable-shared build PCRE as a static library],
if test "$enableval" = "no"; then
LIBTOOL=
LIBSUFFIX=a
fi
)
dnl Handle --enable-utf8
AC_ARG_ENABLE(utf8,
[ --enable-utf8 enable UTF8 support (incomplete)],
if test "$enableval" = "yes"; then
UTF8=-DSUPPORT_UTF8
fi
)
dnl "Export" these variables
AC_SUBST(HAVE_MEMMOVE)
AC_SUBST(HAVE_STRERROR)
AC_SUBST(LIBTOOL)
AC_SUBST(LIBSUFFIX)
AC_SUBST(UTF8)
AC_SUBST(PCRE_MAJOR)
AC_SUBST(PCRE_MINOR)
AC_SUBST(PCRE_DATE)
AC_SUBST(PCRE_VERSION)
AC_SUBST(PCRE_LIB_VERSION)
AC_SUBST(PCRE_POSIXLIB_VERSION)
dnl This must be last; it determines what files are written
AC_OUTPUT(Makefile pcre.h:pcre.in pcre-config:pcre-config.in RunTest:RunTest.in,[chmod a+x RunTest pcre-config])
privoxy-3.0.21-stable/./pcre/pcre.c 000640 001751 001751 00000451021 10546014100 016002 0 ustar 00fk fk 000000 000000 /*************************************************
* Perl-Compatible Regular Expressions *
*************************************************/
/*
This is a library of functions to support regular expressions whose syntax
and semantics are as close as possible to those of the Perl 5 language. See
the file Tech.Notes for some information on the internals.
Written by: Philip Hazel
Copyright (c) 1997-2000 University of Cambridge
-----------------------------------------------------------------------------
Permission is granted to anyone to use this software for any purpose on any
computer system, and to redistribute it freely, subject to the following
restrictions:
1. This software is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
2. The origin of this software must not be misrepresented, either by
explicit claim or by omission.
3. Altered versions must be plainly marked as such, and must not be
misrepresented as being the original software.
4. If PCRE is embedded in any software that is released under the GNU
General Purpose Licence (GPL), then the terms of that licence shall
supersede any condition above with which it is incompatible.
-----------------------------------------------------------------------------
*/
/* Define DEBUG to get debugging output on stdout. */
/* #define DEBUG */
/* Use a macro for debugging printing, 'cause that eliminates the use of #ifdef
inline, and there are *still* stupid compilers about that don't like indented
pre-processor statements. I suppose it's only been 10 years...

DPRINTF takes ONE argument, which must itself be a complete parenthesized
printf argument list, e.g. DPRINTF(("count=%d\n", n)); - hence the doubled
parentheses at call sites. When DEBUG is not defined the macro expands to
nothing, so the arguments are not evaluated at all. */
#ifdef DEBUG
#define DPRINTF(p) printf p
#else
#define DPRINTF(p) /*nothing*/
#endif
/* Include the internals header, which itself includes Standard C headers plus
the external pcre header. <limits.h> is named explicitly so that INT_MAX is
available regardless of what the internals header pulls in. */
#include "internal.h"
#include <limits.h>
/* Allow compilation as C++ source code, should anybody want to do that.
"class" is used as an ordinary identifier in this file (for example the local
bit map in compile_branch()), so it is renamed when compiling as C++, where it
is a keyword. */
#ifdef __cplusplus
#define class pcre_class
#endif
/* Number of items on the nested bracket stacks at compile time. This should
not be set greater than 200. */
#define BRASTACK_SIZE 200
/* The number of bytes in a literal character string above which we can't add
any more is different when UTF-8 characters may be encountered. The limit is
lower when UTF-8 support is compiled in - presumably to leave room for a
complete multi-byte character; TODO confirm against the code that uses
MAXLIT. */
#ifdef SUPPORT_UTF8
#define MAXLIT 250
#else
#define MAXLIT 255
#endif
/* Min and max values for the common repeats; for the maxima, 0 => infinity.
The six entries look like the quantifiers *, *?, +, +?, ? and ?? in that order
(a greedy form followed by its minimizing form). TODO confirm against the code
that indexes these tables, which is not in this part of the file. */
static const char rep_min[] = { 0, 0, 1, 1, 0, 0 };
static const char rep_max[] = { 0, 0, 0, 0, 1, 1 };
/* Text forms of OP_ values and things, for debugging (not all used). The
table is only compiled when DEBUG is defined. It is presumably indexed by
opcode value, in which case its order must be kept in step with the OP_
definitions in the internals header - TODO confirm. */
#ifdef DEBUG
static const char *OP_names[] = {
"End", "\\A", "\\B", "\\b", "\\D", "\\d",
"\\S", "\\s", "\\W", "\\w", "\\Z", "\\z",
"Opt", "^", "$", "Any", "chars", "not",
"*", "*?", "+", "+?", "?", "??", "{", "{", "{",
"*", "*?", "+", "+?", "?", "??", "{", "{", "{",
"*", "*?", "+", "+?", "?", "??", "{", "{", "{",
"*", "*?", "+", "+?", "?", "??", "{", "{",
"class", "Ref", "Recurse",
"Alt", "Ket", "KetRmax", "KetRmin", "Assert", "Assert not",
"AssertB", "AssertB not", "Reverse", "Once", "Cond", "Cref",
"Brazero", "Braminzero", "Bra"
};
#endif
/* Table for handling escaped characters in the range '0'-'z'. Positive returns
are simple data values; negative values are for special things like \d and so
on. Zero means further processing is needed (for things like \x), or the escape
is invalid.

The table is indexed by (character - '0'); check_escape() rejects anything
outside '0'..'z' before looking here. Each row below covers eight consecutive
character codes, as shown by the trailing range comments. The bare numbers are
character codes: 7 for \a and 27 for \e. */
static const short int escapes[] = {
0, 0, 0, 0, 0, 0, 0, 0, /* 0 - 7 */
0, 0, ':', ';', '<', '=', '>', '?', /* 8 - ? */
'@', -ESC_A, -ESC_B, 0, -ESC_D, 0, 0, 0, /* @ - G */
0, 0, 0, 0, 0, 0, 0, 0, /* H - O */
0, 0, 0, -ESC_S, 0, 0, 0, -ESC_W, /* P - W */
0, 0, -ESC_Z, '[', '\\', ']', '^', '_', /* X - _ */
'`', 7, -ESC_b, 0, -ESC_d, 27, '\f', 0, /* ` - g */
0, 0, 0, 0, 0, 0, '\n', 0, /* h - o */
0, 0, '\r', -ESC_s, '\t', 0, 0, -ESC_w, /* p - w */
0, 0, -ESC_z /* x - z */
};
/* Tables of names of POSIX character classes and their lengths. The list is
terminated by a zero length entry. The first three must be alpha, lower, upper
(in that order), as this is assumed for handling case independence.

Only posix_name_lengths carries the terminating zero, so it has one more
element than posix_names; the two tables are indexed in parallel and must be
kept in step with each other and with posix_class_maps. */
static const char *posix_names[] = {
"alpha", "lower", "upper",
"alnum", "ascii", "cntrl", "digit", "graph",
"print", "punct", "space", "word", "xdigit" };
static const uschar posix_name_lengths[] = {
5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 4, 6, 0 };
/* Table of class bit maps for each POSIX class; up to three may be combined
to form the class.

There is one row of three entries per class, in the same order as the POSIX
class name table, so the row for class number n starts at index 3*n. Each
entry is an offset into the cbits character table; -1 marks an unused slot and
ends the row early. */
static const int posix_class_maps[] = {
cbit_lower, cbit_upper, -1, /* alpha */
cbit_lower, -1, -1, /* lower */
cbit_upper, -1, -1, /* upper */
cbit_digit, cbit_lower, cbit_upper, /* alnum */
cbit_print, cbit_cntrl, -1, /* ascii */
cbit_cntrl, -1, -1, /* cntrl */
cbit_digit, -1, -1, /* digit */
cbit_graph, -1, -1, /* graph */
cbit_print, -1, -1, /* print */
cbit_punct, -1, -1, /* punct */
cbit_space, -1, -1, /* space */
cbit_word, -1, -1, /* word */
cbit_xdigit,-1, -1 /* xdigit */
};
/* Forward declaration to allow mutual recursion: the function is declared
here so that code earlier in the file can call it before its definition (the
definition itself is not in this part of the file). */
static BOOL
compile_regex(int, int, int *, uschar **, const uschar **, const char **,
BOOL, int, int *, int *, compile_data *);
/* Structure for building a chain of data that actually lives on the
stack, for holding the values of the subject pointer at the start of each
subpattern, so as to detect when an empty string has been matched by a
subpattern - to break infinite loops. */
typedef struct eptrblock {
/* Link to the previously recorded block in the chain */
struct eptrblock *prev;
/* Subject position remembered for this subpattern */
const uschar *saved_eptr;
} eptrblock;
/* Flag bits for the match() function. These are distinct single-bit values,
so they can be combined with | and tested with &. */
#define match_condassert 0x01 /* Called to check a condition assertion */
#define match_isgroup 0x02 /* Set if start of bracketed group */
/*************************************************
* Global variables *
*************************************************/
/* PCRE is thread-clean and doesn't use any global variables in the normal
sense. However, it calls memory allocation and free functions via the two
indirections below, which can be changed by the caller, but are shared
between all threads. They default to the standard malloc() and free(). */
void *(*pcre_malloc)(size_t) = malloc;
void (*pcre_free)(void *) = free;
/*************************************************
* Macros and tables for character handling *
*************************************************/
/* When UTF-8 encoding is being used, a character is no longer just a single
byte. The macros for character handling generate simple sequences when used in
byte-mode, and more complicated ones for UTF-8 characters.

Usage notes that follow from the definitions below:
  . GETCHARINC and GETCHARLEN expand to bare statements that already end in
    ';' or '}' and are not wrapped in do { } while (0), so they cannot be used
    as the unbraced body of an if/else.
  . The UTF-8 versions refer to a variable called md (for md->utf8), which
    must be in scope wherever they are used.
  . The arguments are not parenthesized and eptr may be evaluated more than
    once, so pass plain variables only.
  . In byte mode GETCHARLEN does not assign to len.

NOTE(review): the multi-byte decoding takes the LOW-order bits of the value
from the first byte and places each following byte at a successively higher
shift. That agrees with ord2utf8() in this file, but it is the opposite of
standard UTF-8, where the first byte carries the high-order bits. Confirm
before relying on this code for real UTF-8 data. */
#ifndef SUPPORT_UTF8
#define GETCHARINC(c, eptr) c = *eptr++;
#define GETCHARLEN(c, eptr, len) c = *eptr;
#define BACKCHAR(eptr)
#else /* SUPPORT_UTF8 */
/* Get the next UTF-8 character, advancing the pointer */
#define GETCHARINC(c, eptr) \
c = *eptr++; \
if (md->utf8 && (c & 0xc0) == 0xc0) \
{ \
int a = utf8_table4[c & 0x3f]; /* Number of additional bytes */ \
int s = 6 - a; /* Amount to shift next byte */ \
c &= utf8_table3[a]; /* Low order bits from first byte */ \
while (a-- > 0) \
{ \
c |= (*eptr++ & 0x3f) << s; \
s += 6; \
} \
}
/* Get the next UTF-8 character, not advancing the pointer, setting length.
len is set to the total number of bytes the character occupies. */
#define GETCHARLEN(c, eptr, len) \
c = *eptr; \
len = 1; \
if (md->utf8 && (c & 0xc0) == 0xc0) \
{ \
int i; \
int a = utf8_table4[c & 0x3f]; /* Number of additional bytes */ \
int s = 6 - a; /* Amount to shift next byte */ \
c &= utf8_table3[a]; /* Low order bits from first byte */ \
for (i = 1; i <= a; i++) \
{ \
c |= (eptr[i] & 0x3f) << s; \
s += 6; \
} \
len += a; \
}
/* If the pointer is not at the start of a character, move it back until
it is. Bytes of the form 10xxxxxx are treated as continuation bytes. */
#define BACKCHAR(eptr) while((*eptr & 0xc0) == 0x80) eptr--;
#endif
/*************************************************
* Default character tables *
*************************************************/
/* A default set of character tables is included in the PCRE binary. Its source
is built by the maketables auxiliary program, which uses the default C ctypes
functions, and put in the file chartables.c. These tables are used by PCRE
whenever the caller of pcre_compile() does not provide an alternate set of
tables. */
#include "chartables.c"
#ifdef SUPPORT_UTF8
/*************************************************
* Tables for UTF-8 support *
*************************************************/
/* These are the breakpoints for different numbers of bytes in a UTF-8
character. Entry n is the largest value that fits in n+1 bytes. */
static int utf8_table1[] = { 0x7f, 0x7ff, 0xffff, 0x1fffff, 0x3ffffff, 0x7fffffff};
/* These are the indicator bits and the mask for the data bits to set in the
first byte of a character, indexed by the number of additional bytes.
utf8_table2 holds the indicator bits and utf8_table3 the data-bit masks. */
static int utf8_table2[] = { 0, 0xc0, 0xe0, 0xf0, 0xf8, 0xfc};
static int utf8_table3[] = { 0xff, 0x1f, 0x0f, 0x07, 0x03, 0x01};
/* Table of the number of extra characters, indexed by the first character
masked with 0x3f. The highest number for a valid UTF-8 character is in fact
0x3d. The four rows of sixteen entries correspond to first bytes 0xc0-0xcf,
0xd0-0xdf, 0xe0-0xef and 0xf0-0xff respectively. */
static uschar utf8_table4[] = {
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,
2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
3,3,3,3,3,3,3,3,4,4,4,4,5,5,5,5 };
/*************************************************
* Convert character value to UTF-8 *
*************************************************/
/* Encode an integer character value as a multi-byte sequence.

The number of additional bytes is found by comparing the value against the
breakpoints in utf8_table1. The first byte written combines the indicator
bits from utf8_table2 with the low-order bits of the value (masked by
utf8_table3); every additional byte carries the next six bits, tagged with
0x80.

NOTE(review): because the low-order bits go into the FIRST byte, the byte
order produced here is not that of standard UTF-8 (which puts the high-order
bits first). It does agree with the decoding macros in this file. Confirm
before changing either side on its own.

Arguments:
  cvalue     the character value, in the range 0 - 0x7fffffff
  buffer     pointer to buffer for result - at least 6 bytes long

Returns:     number of bytes placed in the buffer (1 to 6)
*/

static int
ord2utf8(int cvalue, uschar *buffer)
{
uschar *out = buffer;
int extra = 0;
int remaining;

/* Find the first breakpoint that the value does not exceed */

while (extra < sizeof(utf8_table1)/sizeof(int) && cvalue > utf8_table1[extra])
  extra++;

/* First byte: indicator bits plus the low-order data bits */

*out++ = utf8_table2[extra] | (cvalue & utf8_table3[extra]);
cvalue >>= 6 - extra;

/* Additional bytes, six bits at a time */

for (remaining = extra; remaining > 0; remaining--)
  {
  *out++ = 0x80 | (cvalue & 0x3f);
  cvalue >>= 6;
  }

return extra + 1;
}
#endif
/*************************************************
* Return version string *
*************************************************/
/* Two-level stringification: XSTRING() lets its argument be macro-expanded
before STRING() turns the result into a string literal, so the value of a
macro such as PCRE_MAJOR ends up in the string rather than its name. */

#define STRING(a)  # a
#define XSTRING(s) STRING(s)

/* Return the library version as a string of the form "major.minor date",
built at compile time from PCRE_MAJOR, PCRE_MINOR and PCRE_DATE. The string
has static storage; the caller must not modify or free it. */

const char *
pcre_version(void)
{
static const char version_text[] =
  XSTRING(PCRE_MAJOR) "." XSTRING(PCRE_MINOR) " " XSTRING(PCRE_DATE);
return version_text;
}
/*************************************************
* (Obsolete) Return info about compiled pattern *
*************************************************/
/* This is the original "info" function, kept for backwards compatibility. It
picks a few potentially useful items out of the private structure, but its
interface is rigid. The public options are passed back in an int: although the
re->options field is wider, the result is masked with PUBLIC_OPTIONS before
being converted.

Arguments:
  external_re   points to compiled code
  optptr        where to pass back the options (may be NULL)
  first_char    where to pass back the first character (may be NULL),
                or -1 if multiline and all branches start ^,
                or -2 otherwise

Returns:        number of capturing subpatterns
                or negative values on error (PCRE_ERROR_NULL if external_re
                is NULL, PCRE_ERROR_BADMAGIC if the magic number is wrong)
*/

int
pcre_info(const pcre *external_re, int *optptr, int *first_char)
{
const real_pcre *compiled = (const real_pcre *)external_re;

if (compiled == NULL) return PCRE_ERROR_NULL;
if (compiled->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;

if (optptr != NULL)
  *optptr = (int)(compiled->options & PUBLIC_OPTIONS);

if (first_char != NULL)
  {
  if ((compiled->options & PCRE_FIRSTSET) != 0)
    *first_char = compiled->first_char;
  else if ((compiled->options & PCRE_STARTLINE) != 0)
    *first_char = -1;
  else
    *first_char = -2;
  }

return compiled->top_bracket;
}
/*************************************************
* Return info about compiled pattern *
*************************************************/
/* This is the newer "info" function. It has an extensible interface: the
caller names one item of information and supplies the address of a variable
to receive it. The type written through "where" depends on the item:

  PCRE_INFO_OPTIONS        unsigned long int
  PCRE_INFO_SIZE           size_t
  PCRE_INFO_CAPTURECOUNT   int
  PCRE_INFO_BACKREFMAX     int
  PCRE_INFO_FIRSTCHAR      int (the character, or -1 if all branches start
                             with ^ in multiline mode, or -2 otherwise)
  PCRE_INFO_FIRSTTABLE     const uschar * (NULL if no study data or no map)
  PCRE_INFO_LASTLITERAL    int (the character, or -1 if none)

Arguments:
  external_re      points to compiled code
  study_data       points to study data, or NULL
  what             what information is required
  where            where to put the information

Returns:           0 if data returned, negative on error
                   (PCRE_ERROR_NULL, PCRE_ERROR_BADMAGIC or
                   PCRE_ERROR_BADOPTION)
*/

int
pcre_fullinfo(const pcre *external_re, const pcre_extra *study_data, int what,
  void *where)
{
const real_pcre *compiled = (const real_pcre *)external_re;
const real_pcre_extra *extra = (const real_pcre_extra *)study_data;

if (compiled == NULL || where == NULL) return PCRE_ERROR_NULL;
if (compiled->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;

switch (what)
  {
  case PCRE_INFO_OPTIONS:
    {
    unsigned long int *out = (unsigned long int *)where;
    *out = compiled->options & PUBLIC_OPTIONS;
    return 0;
    }

  case PCRE_INFO_SIZE:
    {
    size_t *out = (size_t *)where;
    *out = compiled->size;
    return 0;
    }

  case PCRE_INFO_CAPTURECOUNT:
    {
    int *out = (int *)where;
    *out = compiled->top_bracket;
    return 0;
    }

  case PCRE_INFO_BACKREFMAX:
    {
    int *out = (int *)where;
    *out = compiled->top_backref;
    return 0;
    }

  case PCRE_INFO_FIRSTCHAR:
    {
    int *out = (int *)where;
    if ((compiled->options & PCRE_FIRSTSET) != 0)
      *out = compiled->first_char;
    else if ((compiled->options & PCRE_STARTLINE) != 0)
      *out = -1;
    else
      *out = -2;
    return 0;
    }

  case PCRE_INFO_FIRSTTABLE:
    {
    const uschar **out = (const uschar **)where;
    if (extra != NULL && (extra->options & PCRE_STUDY_MAPPED) != 0)
      *out = extra->start_bits;
    else
      *out = NULL;
    return 0;
    }

  case PCRE_INFO_LASTLITERAL:
    {
    int *out = (int *)where;
    if ((compiled->options & PCRE_REQCHSET) != 0)
      *out = compiled->req_char;
    else
      *out = -1;
    return 0;
    }

  default:
    break;
  }

/* Unknown request */

return PCRE_ERROR_BADOPTION;
}
#ifdef DEBUG
/*************************************************
* Debugging function to print chars *
*************************************************/
/* Print a sequence of chars in printable format on stdout. Printable
characters are written as themselves, everything else as \xhh. When the
characters come from the subject string, output stops at the end of the
subject even if more were requested.

Arguments:
  p           points to characters
  length      number to print
  is_subject  TRUE if printing from within md->start_subject
  md          pointer to matching data block, if is_subject is TRUE

Returns:     nothing
*/

static void
pchars(const uschar *p, int length, BOOL is_subject, match_data *md)
{
int remaining = length;

/* Never run past the end of the subject */

if (is_subject && remaining > md->end_subject - p)
  remaining = md->end_subject - p;

for (; remaining > 0; remaining--)
  {
  int ch = *p++;
  if (isprint(ch))
    printf("%c", ch);
  else
    printf("\\x%02x", ch);
  }
}
#endif
/*************************************************
* Handle escapes *
*************************************************/
/* This function is called when a \ has been encountered. It either returns a
positive value for a simple escape such as \n, or a negative value which
encodes one of the more complicated things such as \d. When UTF-8 is enabled,
a positive value greater than 255 may be returned. On entry, ptr is pointing at
the \. On exit, it is on the final character of the escape sequence.

Numeric values are accumulated without ever overflowing an int: the decimal
number after a backslash saturates at INT_MAX (which can never be a valid
back reference, so the digits are then re-read as octal), and the hex number
in \x{...} is accumulated in an unsigned long and range-checked afterwards.

Arguments:
  ptrptr     points to the pattern position pointer
  errorptr   points to the pointer to the error message
  bracount   number of previous extracting brackets
  options    the options bits
  isclass    TRUE if inside a character class
  cd         pointer to char tables block

Returns:     zero or positive => a data character
             negative => a special escape sequence
             on error, errorptr is set

Note that when \c is followed by the end of the pattern the function returns
immediately with errorptr set and WITHOUT updating *ptrptr.
*/

static int
check_escape(const uschar **ptrptr, const char **errorptr, int bracount,
  int options, BOOL isclass, compile_data *cd)
{
const uschar *ptr = *ptrptr;
int c, i;

/* If backslash is at the end of the pattern, it's an error. */

c = *(++ptr);
if (c == 0) *errorptr = ERR1;

/* Digits or letters may have special meaning; all others are literals. */

else if (c < '0' || c > 'z') {}

/* Do an initial lookup in a table. A non-zero result is something that can be
returned immediately. Otherwise further processing may be required. */

else if ((i = escapes[c - '0']) != 0) c = i;

/* Escapes that need further processing, or are illegal. The table lookup
above has left i equal to zero; the digit-counting loops below rely on that. */

else
  {
  const uschar *oldptr;
  switch (c)
    {
    /* The handling of escape sequences consisting of a string of digits
    starting with one that is not zero is not straightforward. By experiment,
    the way Perl works seems to be as follows:

    Outside a character class, the digits are read as a decimal number. If the
    number is less than 10, or if there are that many previous extracting
    left brackets, then it is a back reference. Otherwise, up to three octal
    digits are read to form an escaped byte. Thus \123 is likely to be octal
    123 (cf \0123, which is octal 012 followed by the literal 3). If the octal
    value is greater than 377, the least significant 8 bits are taken. Inside a
    character class, \ followed by a digit is always an octal number. */

    case '1': case '2': case '3': case '4': case '5':
    case '6': case '7': case '8': case '9':

    if (!isclass)
      {
      oldptr = ptr;
      c -= '0';
      while ((cd->ctypes[ptr[1]] & ctype_digit) != 0)
        {
        int digit = *(++ptr) - '0';

        /* Saturate instead of overflowing; a wrapped (negative) value would
        wrongly pass the "c < 10" test below. */

        if (c > (INT_MAX - digit) / 10) c = INT_MAX;
          else c = c * 10 + digit;
        }
      if (c < 10 || c <= bracount)
        {
        c = -(ESC_REF + c);
        break;
        }
      ptr = oldptr;      /* Put the pointer back and fall through */
      }

    /* Handle an octal number following \. If the first digit is 8 or 9, Perl
    generates a binary zero byte and treats the digit as a following literal.
    Thus we have to pull back the pointer by one. */

    if ((c = *ptr) >= '8')
      {
      ptr--;
      c = 0;
      break;
      }

    /* \0 always starts an octal number, but we may drop through to here with a
    larger first octal digit. At most two further octal digits are read. */

    /* Fall through */

    case '0':
    c -= '0';
    while(i++ < 2 && (cd->ctypes[ptr[1]] & ctype_digit) != 0 &&
      ptr[1] != '8' && ptr[1] != '9')
        c = c * 8 + *(++ptr) - '0';
    c &= 255;     /* Take least significant 8 bits */
    break;

    /* \x is complicated when UTF-8 is enabled. \x{ddd} is a character number
    which can be greater than 0xff, but only if the ddd are hex digits. */

    case 'x':
#ifdef SUPPORT_UTF8
    if (ptr[1] == '{' && (options & PCRE_UTF8) != 0)
      {
      const uschar *pt = ptr + 2;
      register int count = 0;
      unsigned long value = 0;
      c = 0;
      while ((cd->ctypes[*pt] & ctype_xdigit) != 0)
        {
        count++;

        /* Only the first 8 digits are accumulated (they always fit in an
        unsigned long); any more digits are an error anyway. */

        if (count <= 8)
          value = value * 16 + (unsigned long)(cd->lcc[*pt] -
            (((cd->ctypes[*pt] & ctype_digit) != 0)? '0' : 'W'));
        pt++;
        }
      if (*pt == '}')
        {
        if (count > 8 || value > (unsigned long)INT_MAX) *errorptr = ERR34;
          else c = (int)value;
        ptr = pt;
        break;
        }
      /* If the sequence of hex digits does not end with '}', then we don't
      recognize this construct; fall through to the normal \x handling. */
      }
#endif

    /* Read up to two hex digits */

    c = 0;
    while (i++ < 2 && (cd->ctypes[ptr[1]] & ctype_xdigit) != 0)
      {
      ptr++;
      c = c * 16 + cd->lcc[*ptr] -
        (((cd->ctypes[*ptr] & ctype_digit) != 0)? '0' : 'W');
      }
    break;

    /* Other special escapes not starting with a digit are straightforward */

    case 'c':
    c = *(++ptr);
    if (c == 0)
      {
      *errorptr = ERR2;
      return 0;
      }

    /* A letter is upper-cased; then the 0x40 bit is flipped */

    if (c >= 'a' && c <= 'z') c = cd->fcc[c];
    c ^= 0x40;
    break;

    /* PCRE_EXTRA enables extensions to Perl in the matter of escapes. Any
    other alphameric following \ is an error if PCRE_EXTRA was set; otherwise,
    for Perl compatibility, it is a literal. This code looks a bit odd, but
    there used to be some cases other than the default, and there may be again
    in future, so I haven't "optimized" it. */

    default:
    if ((options & PCRE_EXTRA) != 0) switch(c)
      {
      default:
      *errorptr = ERR3;
      break;
      }
    break;
    }
  }

*ptrptr = ptr;
return c;
}
/*************************************************
* Check for counted repeat *
*************************************************/
/* This function is called when a '{' is encountered in a place where it might
start a quantifier. It looks ahead, without consuming anything, to decide
whether the text really is a quantifier. Only the three forms {ddd}, {ddd,}
and {ddd,ddd} qualify, where each ddd is one or more digits.

Arguments:
  p         pointer to the first char after '{'
  cd        pointer to char tables block

Returns:    TRUE or FALSE
*/

static BOOL
is_counted_repeat(const uschar *p, compile_data *cd)
{
const uschar *scan = p;

/* The minimum: at least one digit is required */

if ((cd->ctypes[*scan] & ctype_digit) == 0) return FALSE;
do scan++; while ((cd->ctypes[*scan] & ctype_digit) != 0);

/* {ddd} */

if (*scan == '}') return TRUE;

/* Anything other than a comma here rules out a quantifier */

if (*scan != ',') return FALSE;
scan++;

/* {ddd,} */

if (*scan == '}') return TRUE;

/* The maximum: again at least one digit, then the closing brace */

if ((cd->ctypes[*scan] & ctype_digit) == 0) return FALSE;
do scan++; while ((cd->ctypes[*scan] & ctype_digit) != 0);

return (*scan == '}');
}
/*************************************************
* Read repeat counts *
*************************************************/
/* Read an item of the form {n,m} and return the values. This is called only
after is_counted_repeat() has confirmed that a repeat-count quantifier exists,
so the syntax is guaranteed to be correct, but we need to check the values.

The numbers are accumulated so that they can never overflow an int: a value
that would exceed INT_MAX is held at INT_MAX. Such a value is always rejected
by the range check below, and the relative order of min and max is preserved
for every number that fits in an int.

Arguments:
  p          pointer to first char after '{'
  minp       pointer to int for min
  maxp       pointer to int for max
               returned as -1 if no max
  errorptr   points to pointer to error message
  cd         pointer to character tables block

Returns:     pointer to '}' on success;
             current ptr on error, with errorptr set

*minp and *maxp are written only on success.
*/

static const uschar *
read_repeat_counts(const uschar *p, int *minp, int *maxp,
  const char **errorptr, compile_data *cd)
{
int min = 0;
int max = -1;

while ((cd->ctypes[*p] & ctype_digit) != 0)
  {
  int digit = *p++ - '0';
  if (min > (INT_MAX - digit) / 10) min = INT_MAX;   /* saturate */
    else min = min * 10 + digit;
  }

/* {n} means exactly n; {n,} leaves max at -1 (no maximum) */

if (*p == '}') max = min; else
  {
  if (*(++p) != '}')
    {
    max = 0;
    while((cd->ctypes[*p] & ctype_digit) != 0)
      {
      int digit = *p++ - '0';
      if (max > (INT_MAX - digit) / 10) max = INT_MAX;   /* saturate */
        else max = max * 10 + digit;
      }
    if (max < min)
      {
      *errorptr = ERR4;
      return p;
      }
    }
  }

/* Do paranoid checks, then fill in the required variables, and pass back the
pointer to the terminating '}'. */

if (min > 65535 || max > 65535)
  *errorptr = ERR5;
else
  {
  *minp = min;
  *maxp = max;
  }
return p;
}
/*************************************************
* Find the fixed length of a pattern *
*************************************************/
/* Scan a pattern and compute the fixed length of subject that will match it,
if the length is fixed. This is needed for dealing with backward assertions.

The function walks the compiled opcodes of every alternative branch of the
bracket at "code", adding up the number of characters each item must match.
All branches must come to the same total. Nested brackets are handled by
recursion. Any item whose length can vary, or which this function does not
know about, makes the result -1.

Arguments:
code points to the start of the pattern (the bracket)
options the compiling options (only passed on to recursive calls here)
Returns: the fixed length, or -1 if there is no fixed length
*/
static int
find_fixedlength(uschar *code, int options)
{
/* length is the agreed length of the branches seen so far (-1 = none yet);
branchlength accumulates the length of the branch being scanned. cc starts
just past the bracket opcode and the two bytes that follow it. */
int length = -1;
register int branchlength = 0;
register uschar *cc = code + 3;
/* Scan along the opcodes for this branch. If we get to the end of the
branch, check the length against that of the other branches. */
for (;;)
{
int d;
register int op = *cc;
/* Every opcode from OP_BRA upwards is treated as a plain bracket */
if (op >= OP_BRA) op = OP_BRA;
switch (op)
{
/* A nested group: add its fixed length (found recursively), then step over
all of its alternatives using the 2-byte offset stored after each opcode,
and finally over the 3 bytes of the closing item. */
case OP_BRA:
case OP_ONCE:
case OP_COND:
d = find_fixedlength(cc, options);
if (d < 0) return -1;
branchlength += d;
do cc += (cc[1] << 8) + cc[2]; while (*cc == OP_ALT);
cc += 3;
break;
/* Reached end of a branch; if it's a ket it is the end of a nested
call. If it's ALT it is an alternation in a nested call. If it is
END it's the end of the outer call. All can be handled by the same code.
The first branch sets the length; any later branch that differs means the
length is not fixed. */
case OP_ALT:
case OP_KET:
case OP_KETRMAX:
case OP_KETRMIN:
case OP_END:
if (length < 0) length = branchlength;
else if (length != branchlength) return -1;
if (*cc != OP_ALT) return length;
cc += 3;
branchlength = 0;
break;
/* Skip over assertive subpatterns; they add nothing to the length */
case OP_ASSERT:
case OP_ASSERT_NOT:
case OP_ASSERTBACK:
case OP_ASSERTBACK_NOT:
do cc += (cc[1] << 8) + cc[2]; while (*cc == OP_ALT);
cc += 3;
break;
/* Skip over things that don't match chars. The cases below deliberately
fall through so that each opcode advances cc by its own total size:
OP_REVERSE by 3, OP_CREF and OP_OPT by 2, the rest by 1. */
case OP_REVERSE:
cc++;
/* Fall through */
case OP_CREF:
case OP_OPT:
cc++;
/* Fall through */
case OP_SOD:
case OP_EOD:
case OP_EODN:
case OP_CIRC:
case OP_DOLL:
case OP_NOT_WORD_BOUNDARY:
case OP_WORD_BOUNDARY:
cc++;
break;
/* Handle char strings. In UTF-8 mode we must count characters, not bytes.
This requires a scan of the string, unfortunately. We assume valid UTF-8
strings, so all we do is reduce the length by one for byte whose bits are
10xxxxxx. The byte after the opcode is the string's length in bytes. */
case OP_CHARS:
branchlength += *(++cc);
#ifdef SUPPORT_UTF8
for (d = 1; d <= *cc; d++)
if ((cc[d] & 0xc0) == 0x80) branchlength--;
#endif
cc += *cc + 1;
break;
/* Handle exact repetitions: a 2-byte count follows the opcode, and the
item is 4 bytes long in total. */
case OP_EXACT:
case OP_TYPEEXACT:
branchlength += (cc[1] << 8) + cc[2];
cc += 4;
break;
/* Handle single-char matchers */
case OP_NOT_DIGIT:
case OP_DIGIT:
case OP_NOT_WHITESPACE:
case OP_WHITESPACE:
case OP_NOT_WORDCHAR:
case OP_WORDCHAR:
case OP_ANY:
branchlength++;
cc++;
break;
/* Check a class for variable quantification.
NOTE(review): cc still points at the OP_CLASS opcode when the test below is
made, so the OP_REF alternative cannot be taken in this case and the step is
always 33 (the opcode plus the 32-byte bit map) - confirm whether OP_REF was
ever meant to share this case. */
case OP_CLASS:
cc += (*cc == OP_REF)? 2 : 33;
switch (*cc)
{
case OP_CRSTAR:
case OP_CRMINSTAR:
case OP_CRQUERY:
case OP_CRMINQUERY:
return -1;
/* A counted repeat is fixed only if its minimum equals its maximum */
case OP_CRRANGE:
case OP_CRMINRANGE:
if ((cc[1] << 8) + cc[2] != (cc[3] << 8) + cc[4]) return -1;
branchlength += (cc[1] << 8) + cc[2];
cc += 5;
break;
/* Anything else counts the class as one character and leaves cc where it
is, so that the item found there is examined by the main loop. */
default:
branchlength++;
}
break;
/* Anything else is variable length */
default:
return -1;
}
}
/* Control never gets here */
}
/*************************************************
* Check for POSIX class syntax *
*************************************************/
/* This function is called when the sequence "[:" or "[." or "[=" is
encountered in a character class. The character after the '[' is taken as the
terminator. What follows must be an optional ^, then zero or more letters,
then the terminator again and a closing ']' - that is, ":]" or ".]" or "=]".
The name itself is not validated here.

Arguments:
  ptr      pointer to the initial [
  endptr   where to return the end pointer; on success it is set to point at
             the terminator character before the final ']'
  cd       pointer to compile data

Returns:   TRUE or FALSE; *endptr is left untouched on FALSE
*/

static BOOL
check_posix_syntax(const uschar *ptr, const uschar **endptr, compile_data *cd)
{
const uschar *scan;
int terminator;

/* Assignments are kept apart from the declarations; the Solaris cc compiler
warns about "non-constant" initializers. */

terminator = ptr[1];
scan = ptr + 2;

/* Optional negation, then the letters of the name */

if (*scan == '^') scan++;
for (; (cd->ctypes[*scan] & ctype_letter) != 0; scan++) ;

/* The name must be closed by the same terminator followed by ']' */

if (*scan != terminator || scan[1] != ']') return FALSE;

*endptr = scan;
return TRUE;
}
/*************************************************
* Check POSIX class name *
*************************************************/
/* This function is called to check the name given in a POSIX-style class entry
such as [:alnum:]. The name is looked up in the table of known class names; an
entry matches only if both its length and its characters agree. The search
ends at the zero entry that terminates the table of name lengths.

Arguments:
  ptr        points to the first letter
  len        the length of the name

Returns:     a value representing the name (its index in the table of
             names), or -1 if unknown
*/

static int
check_posix_name(const uschar *ptr, int len)
{
int idx;

for (idx = 0; posix_name_lengths[idx] != 0; idx++)
  {
  if (posix_name_lengths[idx] != len) continue;
  if (strncmp((const char *)ptr, posix_names[idx], len) == 0) return idx;
  }

return -1;
}
/*************************************************
* Compile one branch *
*************************************************/
/* Scan the pattern, compiling it into the code vector.
Arguments:
options the option bits
brackets points to number of brackets used
code points to the pointer to the current code point
ptrptr points to the current pattern pointer
errorptr points to pointer to error message
optchanged set to the value of the last OP_OPT item compiled
reqchar set to the last literal character required, else -1
countlits set to count of mandatory literal characters
cd contains pointers to tables
Returns: TRUE on success
FALSE, with *errorptr set on error
*/
static BOOL
compile_branch(int options, int *brackets, uschar **codeptr,
const uschar **ptrptr, const char **errorptr, int *optchanged,
int *reqchar, int *countlits, compile_data *cd)
{
int repeat_type, op_type;
int repeat_min, repeat_max;
int bravalue, length;
int greedy_default, greedy_non_default;
int prevreqchar;
int condcount = 0;
int subcountlits = 0;
register int c;
register uschar *code = *codeptr;
uschar *tempcode;
const uschar *ptr = *ptrptr;
const uschar *tempptr;
uschar *previous = NULL;
uschar class[32];
/* Set up the default and non-default settings for greediness */
greedy_default = ((options & PCRE_UNGREEDY) != 0);
greedy_non_default = greedy_default ^ 1;
/* Initialize no required char, and count of literals */
*reqchar = prevreqchar = -1;
*countlits = 0;
/* Switch on next character until the end of the branch */
for (;; ptr++)
{
BOOL negate_class;
int class_charcount;
int class_lastchar;
int newoptions;
int condref;
int subreqchar;
c = *ptr;
if ((options & PCRE_EXTENDED) != 0)
{
if ((cd->ctypes[c] & ctype_space) != 0) continue;
if (c == '#')
{
/* The space before the ; is to avoid a warning on a silly compiler
on the Macintosh. */
while ((c = *(++ptr)) != 0 && c != '\n') ;
continue;
}
}
switch(c)
{
/* The branch terminates at end of string, |, or ). */
case 0:
case '|':
case ')':
*codeptr = code;
*ptrptr = ptr;
return TRUE;
/* Handle single-character metacharacters */
case '^':
previous = NULL;
*code++ = OP_CIRC;
break;
case '$':
previous = NULL;
*code++ = OP_DOLL;
break;
case '.':
previous = code;
*code++ = OP_ANY;
break;
/* Character classes. These always build a 32-byte bitmap of the permitted
characters, except in the special case where there is only one character.
For negated classes, we build the map as usual, then invert it at the end.
*/
case '[':
previous = code;
*code++ = OP_CLASS;
/* If the first character is '^', set the negation flag and skip it. */
if ((c = *(++ptr)) == '^')
{
negate_class = TRUE;
c = *(++ptr);
}
else negate_class = FALSE;
/* Keep a count of chars so that we can optimize the case of just a single
character. */
class_charcount = 0;
class_lastchar = -1;
/* Initialize the 32-char bit map to all zeros. We have to build the
map in a temporary bit of store, in case the class contains only 1
character, because in that case the compiled code doesn't use the
bit map. */
memset(class, 0, 32 * sizeof(uschar));
/* Process characters until ] is reached. By writing this as a "do" it
means that an initial ] is taken as a data character. */
do
{
if (c == 0)
{
*errorptr = ERR6;
goto FAILED;
}
/* Handle POSIX class names. Perl allows a negation extension of the
form [:^name]. A square bracket that doesn't match the syntax is
treated as a literal. We also recognize the POSIX constructions
[.ch.] and [=ch=] ("collating elements") and fault them, as Perl
5.6 does. */
if (c == '[' &&
(ptr[1] == ':' || ptr[1] == '.' || ptr[1] == '=') &&
check_posix_syntax(ptr, &tempptr, cd))
{
BOOL local_negate = FALSE;
int posix_class, i;
register const uschar *cbits = cd->cbits;
if (ptr[1] != ':')
{
*errorptr = ERR31;
goto FAILED;
}
ptr += 2;
if (*ptr == '^')
{
local_negate = TRUE;
ptr++;
}
posix_class = check_posix_name(ptr, tempptr - ptr);
if (posix_class < 0)
{
*errorptr = ERR30;
goto FAILED;
}
/* If matching is caseless, upper and lower are converted to
alpha. This relies on the fact that the class table starts with
alpha, lower, upper as the first 3 entries. */
if ((options & PCRE_CASELESS) != 0 && posix_class <= 2)
posix_class = 0;
/* Or into the map we are building up to 3 of the static class
tables, or their negations. */
posix_class *= 3;
for (i = 0; i < 3; i++)
{
int taboffset = posix_class_maps[posix_class + i];
if (taboffset < 0) break;
if (local_negate)
for (c = 0; c < 32; c++) class[c] |= ~cbits[c+taboffset];
else
for (c = 0; c < 32; c++) class[c] |= cbits[c+taboffset];
}
ptr = tempptr + 1;
class_charcount = 10; /* Set > 1; assumes more than 1 per class */
continue;
}
/* Backslash may introduce a single character, or it may introduce one
of the specials, which just set a flag. Escaped items are checked for
validity in the pre-compiling pass. The sequence \b is a special case.
Inside a class (and only there) it is treated as backspace. Elsewhere
it marks a word boundary. Other escapes have preset maps ready to
or into the one we are building. We assume they have more than one
character in them, so set class_count bigger than one. */
if (c == '\\')
{
c = check_escape(&ptr, errorptr, *brackets, options, TRUE, cd);
if (-c == ESC_b) c = '\b';
else if (c < 0)
{
register const uschar *cbits = cd->cbits;
class_charcount = 10;
switch (-c)
{
case ESC_d:
for (c = 0; c < 32; c++) class[c] |= cbits[c+cbit_digit];
continue;
case ESC_D:
for (c = 0; c < 32; c++) class[c] |= ~cbits[c+cbit_digit];
continue;
case ESC_w:
for (c = 0; c < 32; c++) class[c] |= cbits[c+cbit_word];
continue;
case ESC_W:
for (c = 0; c < 32; c++) class[c] |= ~cbits[c+cbit_word];
continue;
case ESC_s:
for (c = 0; c < 32; c++) class[c] |= cbits[c+cbit_space];
continue;
case ESC_S:
for (c = 0; c < 32; c++) class[c] |= ~cbits[c+cbit_space];
continue;
default:
*errorptr = ERR7;
goto FAILED;
}
}
/* Fall through if single character, but don't at present allow
chars > 255 in UTF-8 mode. */
#ifdef SUPPORT_UTF8
if (c > 255)
{
*errorptr = ERR33;
goto FAILED;
}
#endif
}
/* A single character may be followed by '-' to form a range. However,
Perl does not permit ']' to be the end of the range. A '-' character
here is treated as a literal. */
if (ptr[1] == '-' && ptr[2] != ']')
{
int d;
ptr += 2;
d = *ptr;
if (d == 0)
{
*errorptr = ERR6;
goto FAILED;
}
/* The second part of a range can be a single-character escape, but
not any of the other escapes. Perl 5.6 treats a hyphen as a literal
in such circumstances. */
if (d == '\\')
{
const uschar *oldptr = ptr;
d = check_escape(&ptr, errorptr, *brackets, options, TRUE, cd);
#ifdef SUPPORT_UTF8
if (d > 255)
{
*errorptr = ERR33;
goto FAILED;
}
#endif
/* \b is backslash; any other special means the '-' was literal */
if (d < 0)
{
if (d == -ESC_b) d = '\b'; else
{
ptr = oldptr - 2;
goto SINGLE_CHARACTER; /* A few lines below */
}
}
}
if (d < c)
{
*errorptr = ERR8;
goto FAILED;
}
for (; c <= d; c++)
{
class[c/8] |= (1 << (c&7));
if ((options & PCRE_CASELESS) != 0)
{
int uc = cd->fcc[c]; /* flip case */
class[uc/8] |= (1 << (uc&7));
}
class_charcount++; /* in case a one-char range */
class_lastchar = c;
}
continue; /* Go get the next char in the class */
}
/* Handle a lone single character - we can get here for a normal
non-escape char, or after \ that introduces a single character. */
SINGLE_CHARACTER:
class [c/8] |= (1 << (c&7));
if ((options & PCRE_CASELESS) != 0)
{
c = cd->fcc[c]; /* flip case */
class[c/8] |= (1 << (c&7));
}
class_charcount++;
class_lastchar = c;
}
/* Loop until ']' reached; the check for end of string happens inside the
loop. This "while" is the end of the "do" above. */
while ((c = *(++ptr)) != ']');
/* If class_charcount is 1 and class_lastchar is not negative, we saw
precisely one character. This doesn't need the whole 32-byte bit map.
We turn it into a 1-character OP_CHAR if it's positive, or OP_NOT if
it's negative. */
if (class_charcount == 1 && class_lastchar >= 0)
{
if (negate_class)
{
code[-1] = OP_NOT;
}
else
{
code[-1] = OP_CHARS;
*code++ = 1;
}
*code++ = class_lastchar;
}
/* Otherwise, negate the 32-byte map if necessary, and copy it into
the code vector. */
else
{
if (negate_class)
for (c = 0; c < 32; c++) code[c] = ~class[c];
else
memcpy(code, class, 32);
code += 32;
}
break;
/* Various kinds of repeat */
case '{':
if (!is_counted_repeat(ptr+1, cd)) goto NORMAL_CHAR;
ptr = read_repeat_counts(ptr+1, &repeat_min, &repeat_max, errorptr, cd);
if (*errorptr != NULL) goto FAILED;
goto REPEAT;
case '*':
repeat_min = 0;
repeat_max = -1;
goto REPEAT;
case '+':
repeat_min = 1;
repeat_max = -1;
goto REPEAT;
case '?':
repeat_min = 0;
repeat_max = 1;
REPEAT:
if (previous == NULL)
{
*errorptr = ERR9;
goto FAILED;
}
/* If the next character is '?' this is a minimizing repeat, by default,
but if PCRE_UNGREEDY is set, it works the other way round. Advance to the
next character. */
if (ptr[1] == '?')
{ repeat_type = greedy_non_default; ptr++; }
else repeat_type = greedy_default;
/* If previous was a string of characters, chop off the last one and use it
as the subject of the repeat. If there was only one character, we can
abolish the previous item altogether. A repeat with a zero minimum wipes
out any reqchar setting, backing up to the previous value. We must also
adjust the countlits value. */
if (*previous == OP_CHARS)
{
int len = previous[1];
if (repeat_min == 0) *reqchar = prevreqchar;
*countlits += repeat_min - 1;
if (len == 1)
{
c = previous[2];
code = previous;
}
else
{
c = previous[len+1];
previous[1]--;
code--;
}
op_type = 0; /* Use single-char op codes */
goto OUTPUT_SINGLE_REPEAT; /* Code shared with single character types */
}
/* If previous was a single negated character ([^a] or similar), we use
one of the special opcodes, replacing it. The code is shared with single-
character repeats by adding a suitable offset into repeat_type. */
else if ((int)*previous == OP_NOT)
{
op_type = OP_NOTSTAR - OP_STAR; /* Use "not" opcodes */
c = previous[1];
code = previous;
goto OUTPUT_SINGLE_REPEAT;
}
/* If previous was a character type match (\d or similar), abolish it and
create a suitable repeat item. The code is shared with single-character
repeats by adding a suitable offset into repeat_type. */
else if ((int)*previous < OP_EODN || *previous == OP_ANY)
{
op_type = OP_TYPESTAR - OP_STAR; /* Use type opcodes */
c = *previous;
code = previous;
OUTPUT_SINGLE_REPEAT:
/* If the maximum is zero then the minimum must also be zero; Perl allows
this case, so we do too - by simply omitting the item altogether. */
if (repeat_max == 0) goto END_REPEAT;
/* Combine the op_type with the repeat_type */
repeat_type += op_type;
/* A minimum of zero is handled either as the special case * or ?, or as
an UPTO, with the maximum given. */
if (repeat_min == 0)
{
if (repeat_max == -1) *code++ = OP_STAR + repeat_type;
else if (repeat_max == 1) *code++ = OP_QUERY + repeat_type;
else
{
*code++ = OP_UPTO + repeat_type;
*code++ = repeat_max >> 8;
*code++ = (repeat_max & 255);
}
}
/* The case {1,} is handled as the special case + */
else if (repeat_min == 1 && repeat_max == -1)
*code++ = OP_PLUS + repeat_type;
/* The case {n,n} is just an EXACT, while the general case {n,m} is
handled as an EXACT followed by an UPTO. An EXACT of 1 is optimized. */
else
{
if (repeat_min != 1)
{
*code++ = OP_EXACT + op_type; /* NB EXACT doesn't have repeat_type */
*code++ = repeat_min >> 8;
*code++ = (repeat_min & 255);
}
/* If the minimum is 1 and the previous item was a character string,
we either have to put back the item that got cancelled if the string
length was 1, or add the character back onto the end of a longer
string. For a character type nothing need be done; it will just get
put back naturally. Note that the final character is always going to
get added below. */
else if (*previous == OP_CHARS)
{
if (code == previous) code += 2; else previous[1]++;
}
/* For a single negated character we also have to put back the
item that got cancelled. */
else if (*previous == OP_NOT) code++;
/* If the maximum is unlimited, insert an OP_STAR. */
if (repeat_max < 0)
{
*code++ = c;
*code++ = OP_STAR + repeat_type;
}
/* Else insert an UPTO if the max is greater than the min. */
else if (repeat_max != repeat_min)
{
*code++ = c;
repeat_max -= repeat_min;
*code++ = OP_UPTO + repeat_type;
*code++ = repeat_max >> 8;
*code++ = (repeat_max & 255);
}
}
/* The character or character type itself comes last in all cases. */
*code++ = c;
}
/* If previous was a character class or a back reference, we put the repeat
stuff after it, but just skip the item if the repeat was {0,0}. */
else if (*previous == OP_CLASS || *previous == OP_REF)
{
if (repeat_max == 0)
{
code = previous;
goto END_REPEAT;
}
if (repeat_min == 0 && repeat_max == -1)
*code++ = OP_CRSTAR + repeat_type;
else if (repeat_min == 1 && repeat_max == -1)
*code++ = OP_CRPLUS + repeat_type;
else if (repeat_min == 0 && repeat_max == 1)
*code++ = OP_CRQUERY + repeat_type;
else
{
*code++ = OP_CRRANGE + repeat_type;
*code++ = repeat_min >> 8;
*code++ = repeat_min & 255;
if (repeat_max == -1) repeat_max = 0; /* 2-byte encoding for max */
*code++ = repeat_max >> 8;
*code++ = repeat_max & 255;
}
}
/* If previous was a bracket group, we may have to replicate it in certain
cases. */
else if ((int)*previous >= OP_BRA || (int)*previous == OP_ONCE ||
(int)*previous == OP_COND)
{
register int i;
int ketoffset = 0;
int len = code - previous;
uschar *bralink = NULL;
/* If the maximum repeat count is unlimited, find the end of the bracket
by scanning through from the start, and compute the offset back to it
from the current code pointer. There may be an OP_OPT setting following
the final KET, so we can't find the end just by going back from the code
pointer. */
if (repeat_max == -1)
{
register uschar *ket = previous;
do ket += (ket[1] << 8) + ket[2]; while (*ket != OP_KET);
ketoffset = code - ket;
}
/* The case of a zero minimum is special because of the need to stick
OP_BRAZERO in front of it, and because the group appears once in the
data, whereas in other cases it appears the minimum number of times. For
this reason, it is simplest to treat this case separately, as otherwise
the code gets far too messy. There are several special subcases when the
minimum is zero. */
if (repeat_min == 0)
{
/* If we set up a required char from the bracket, we must back off
to the previous value and reset the countlits value too. */
if (subcountlits > 0)
{
*reqchar = prevreqchar;
*countlits -= subcountlits;
}
/* If the maximum is also zero, we just omit the group from the output
altogether. */
if (repeat_max == 0)
{
code = previous;
goto END_REPEAT;
}
/* If the maximum is 1 or unlimited, we just have to stick in the
BRAZERO and do no more at this point. */
if (repeat_max <= 1)
{
memmove(previous+1, previous, len);
code++;
*previous++ = OP_BRAZERO + repeat_type;
}
/* If the maximum is greater than 1 and limited, we have to replicate
in a nested fashion, sticking OP_BRAZERO before each set of brackets.
The first one has to be handled carefully because it's the original
copy, which has to be moved up. The remainder can be handled by code
that is common with the non-zero minimum case below. We just have to
adjust the value of repeat_max, since one less copy is required. */
else
{
int offset;
memmove(previous+4, previous, len);
code += 4;
*previous++ = OP_BRAZERO + repeat_type;
*previous++ = OP_BRA;
/* We chain together the bracket offset fields that have to be
filled in later when the ends of the brackets are reached. */
offset = (bralink == NULL)? 0 : previous - bralink;
bralink = previous;
*previous++ = offset >> 8;
*previous++ = offset & 255;
}
repeat_max--;
}
/* If the minimum is greater than zero, replicate the group as many
times as necessary, and adjust the maximum to the number of subsequent
copies that we need. */
else
{
for (i = 1; i < repeat_min; i++)
{
memcpy(code, previous, len);
code += len;
}
if (repeat_max > 0) repeat_max -= repeat_min;
}
/* This code is common to both the zero and non-zero minimum cases. If
the maximum is limited, it replicates the group in a nested fashion,
remembering the bracket starts on a stack. In the case of a zero minimum,
the first one was set up above. In all cases the repeat_max now specifies
the number of additional copies needed. */
if (repeat_max >= 0)
{
for (i = repeat_max - 1; i >= 0; i--)
{
*code++ = OP_BRAZERO + repeat_type;
/* All but the final copy start a new nesting, maintaining the
chain of brackets outstanding. */
if (i != 0)
{
int offset;
*code++ = OP_BRA;
offset = (bralink == NULL)? 0 : code - bralink;
bralink = code;
*code++ = offset >> 8;
*code++ = offset & 255;
}
memcpy(code, previous, len);
code += len;
}
/* Now chain through the pending brackets, and fill in their length
fields (which are holding the chain links pro tem). */
while (bralink != NULL)
{
int oldlinkoffset;
int offset = code - bralink + 1;
uschar *bra = code - offset;
oldlinkoffset = (bra[1] << 8) + bra[2];
bralink = (oldlinkoffset == 0)? NULL : bralink - oldlinkoffset;
*code++ = OP_KET;
*code++ = bra[1] = offset >> 8;
*code++ = bra[2] = (offset & 255);
}
}
/* If the maximum is unlimited, set a repeater in the final copy. We
can't just offset backwards from the current code point, because we
don't know if there's been an options resetting after the ket. The
correct offset was computed above. */
else code[-ketoffset] = OP_KETRMAX + repeat_type;
}
/* Else there's some kind of shambles */
else
{
*errorptr = ERR11;
goto FAILED;
}
/* In all cases we no longer have a previous item. */
END_REPEAT:
previous = NULL;
break;
/* Start of nested bracket sub-expression, or comment or lookahead or
lookbehind or option setting or condition. First deal with special things
that can come after a bracket; all are introduced by ?, and the appearance
of any of them means that this is not a referencing group. They were
checked for validity in the first pass over the string, so we don't have to
check for syntax errors here. */
case '(':
newoptions = options;
condref = -1;
if (*(++ptr) == '?')
{
int set, unset;
int *optset;
switch (*(++ptr))
{
case '#': /* Comment; skip to ket */
ptr++;
while (*ptr != ')') ptr++;
continue;
case ':': /* Non-extracting bracket */
bravalue = OP_BRA;
ptr++;
break;
case '(':
bravalue = OP_COND; /* Conditional group */
if ((cd->ctypes[*(++ptr)] & ctype_digit) != 0)
{
condref = *ptr - '0';
while (*(++ptr) != ')') condref = condref*10 + *ptr - '0';
if (condref == 0)
{
*errorptr = ERR35;
goto FAILED;
}
ptr++;
}
else ptr--;
break;
case '=': /* Positive lookahead */
bravalue = OP_ASSERT;
ptr++;
break;
case '!': /* Negative lookahead */
bravalue = OP_ASSERT_NOT;
ptr++;
break;
case '<': /* Lookbehinds */
switch (*(++ptr))
{
case '=': /* Positive lookbehind */
bravalue = OP_ASSERTBACK;
ptr++;
break;
case '!': /* Negative lookbehind */
bravalue = OP_ASSERTBACK_NOT;
ptr++;
break;
default: /* Syntax error */
*errorptr = ERR24;
goto FAILED;
}
break;
case '>': /* One-time brackets */
bravalue = OP_ONCE;
ptr++;
break;
case 'R': /* Pattern recursion */
*code++ = OP_RECURSE;
ptr++;
continue;
default: /* Option setting */
set = unset = 0;
optset = &set;
while (*ptr != ')' && *ptr != ':')
{
switch (*ptr++)
{
case '-': optset = &unset; break;
case 'i': *optset |= PCRE_CASELESS; break;
case 'm': *optset |= PCRE_MULTILINE; break;
case 's': *optset |= PCRE_DOTALL; break;
case 'x': *optset |= PCRE_EXTENDED; break;
case 'U': *optset |= PCRE_UNGREEDY; break;
case 'X': *optset |= PCRE_EXTRA; break;
default:
*errorptr = ERR12;
goto FAILED;
}
}
/* Set up the changed option bits, but don't change anything yet. */
newoptions = (options | set) & (~unset);
/* If the options ended with ')' this is not the start of a nested
group with option changes, so the options change at this level. At top
level there is nothing else to be done (the options will in fact have
been set from the start of compiling as a result of the first pass) but
at an inner level we must compile code to change the ims options if
necessary, and pass the new setting back so that it can be put at the
start of any following branches, and when this group ends, a resetting
item can be compiled. */
if (*ptr == ')')
{
if ((options & PCRE_INGROUP) != 0 &&
(options & PCRE_IMS) != (newoptions & PCRE_IMS))
{
*code++ = OP_OPT;
*code++ = *optchanged = newoptions & PCRE_IMS;
}
options = newoptions; /* Change options at this level */
previous = NULL; /* This item can't be repeated */
continue; /* It is complete */
}
/* If the options ended with ':' we are heading into a nested group
with possible change of options. Such groups are non-capturing and are
not assertions of any kind. All we need to do is skip over the ':';
the newoptions value is handled below. */
bravalue = OP_BRA;
ptr++;
}
}
/* Else we have a referencing group; adjust the opcode. */
else
{
if (++(*brackets) > EXTRACT_MAX)
{
*errorptr = ERR13;
goto FAILED;
}
bravalue = OP_BRA + *brackets;
}
/* Process nested bracketed re. Assertions may not be repeated, but other
kinds can be. We copy code into a non-register variable in order to be able
to pass its address because some compilers complain otherwise. Pass in a
new setting for the ims options if they have changed. */
previous = (bravalue >= OP_ONCE)? code : NULL;
*code = bravalue;
tempcode = code;
if (!compile_regex(
options | PCRE_INGROUP, /* Set for all nested groups */
((options & PCRE_IMS) != (newoptions & PCRE_IMS))?
newoptions & PCRE_IMS : -1, /* Pass ims options if changed */
brackets, /* Bracket level */
&tempcode, /* Where to put code (updated) */
&ptr, /* Input pointer (updated) */
errorptr, /* Where to put an error message */
(bravalue == OP_ASSERTBACK ||
bravalue == OP_ASSERTBACK_NOT), /* TRUE if back assert */
condref, /* Condition reference number */
&subreqchar, /* For possible last char */
&subcountlits, /* For literal count */
cd)) /* Tables block */
goto FAILED;
/* At the end of compiling, code is still pointing to the start of the
group, while tempcode has been updated to point past the end of the group
and any option resetting that may follow it. The pattern pointer (ptr)
is on the bracket. */
/* If this is a conditional bracket, check that there are no more than
two branches in the group. */
if (bravalue == OP_COND)
{
uschar *tc = code;
condcount = 0;
do {
condcount++;
tc += (tc[1] << 8) | tc[2];
}
while (*tc != OP_KET);
if (condcount > 2)
{
*errorptr = ERR27;
goto FAILED;
}
}
/* Handle updating of the required character. If the subpattern didn't
set one, leave it as it was. Otherwise, update it for normal brackets of
all kinds, forward assertions, and conditions with two branches. Don't
update the literal count for forward assertions, however. If the bracket
is followed by a quantifier with zero repeat, we have to back off. Hence
the definition of prevreqchar and subcountlits outside the main loop so
that they can be accessed for the back off. */
if (subreqchar > 0 &&
(bravalue >= OP_BRA || bravalue == OP_ONCE || bravalue == OP_ASSERT ||
(bravalue == OP_COND && condcount == 2)))
{
prevreqchar = *reqchar;
*reqchar = subreqchar;
if (bravalue != OP_ASSERT) *countlits += subcountlits;
}
/* Now update the main code pointer to the end of the group. */
code = tempcode;
/* Error if hit end of pattern */
if (*ptr != ')')
{
*errorptr = ERR14;
goto FAILED;
}
break;
/* Check \ for being a real metacharacter; if not, fall through and handle
it as a data character at the start of a string. Escape items are checked
for validity in the pre-compiling pass. */
case '\\':
tempptr = ptr;
c = check_escape(&ptr, errorptr, *brackets, options, FALSE, cd);
/* Handle metacharacters introduced by \. For ones like \d, the ESC_ values
are arranged to be the negation of the corresponding OP_values. For the
back references, the values are ESC_REF plus the reference number. Only
back references and those types that consume a character may be repeated.
We can test for values between ESC_b and ESC_Z for the latter; this may
have to change if any new ones are ever created. */
if (c < 0)
{
if (-c >= ESC_REF)
{
previous = code;
*code++ = OP_REF;
*code++ = -c - ESC_REF;
}
else
{
previous = (-c > ESC_b && -c < ESC_Z)? code : NULL;
*code++ = -c;
}
continue;
}
/* Data character: reset and fall through */
ptr = tempptr;
c = '\\';
/* Handle a run of data characters until a metacharacter is encountered.
The first character is guaranteed not to be whitespace or # when the
extended flag is set. */
NORMAL_CHAR:
default:
previous = code;
*code = OP_CHARS;
code += 2;
length = 0;
do
{
if ((options & PCRE_EXTENDED) != 0)
{
if ((cd->ctypes[c] & ctype_space) != 0) continue;
if (c == '#')
{
/* The space before the ; is to avoid a warning on a silly compiler
on the Macintosh. */
while ((c = *(++ptr)) != 0 && c != '\n') ;
if (c == 0) break;
continue;
}
}
/* Backslash may introduce a data char or a metacharacter. Escaped items
are checked for validity in the pre-compiling pass. Stop the string
before a metaitem. */
if (c == '\\')
{
tempptr = ptr;
c = check_escape(&ptr, errorptr, *brackets, options, FALSE, cd);
if (c < 0) { ptr = tempptr; break; }
/* If a character is > 127 in UTF-8 mode, we have to turn it into
two or more characters in the UTF-8 encoding. */
#ifdef SUPPORT_UTF8
if (c > 127 && (options & PCRE_UTF8) != 0)
{
uschar buffer[8];
int len = ord2utf8(c, buffer);
for (c = 0; c < len; c++) *code++ = buffer[c];
length += len;
continue;
}
#endif
}
/* Ordinary character or single-char escape */
*code++ = c;
length++;
}
/* This "while" is the end of the "do" above. */
while (length < MAXLIT && (cd->ctypes[c = *(++ptr)] & ctype_meta) == 0);
/* Update the last character and the count of literals */
prevreqchar = (length > 1)? code[-2] : *reqchar;
*reqchar = code[-1];
*countlits += length;
/* Compute the length and set it in the data vector, and advance to
the next state. */
previous[1] = length;
if (length < MAXLIT) ptr--;
break;
}
} /* end of big loop */
/* Control never reaches here by falling through, only by a goto for all the
error states. Pass back the position in the pattern so that it can be displayed
to the user for diagnosing the error. */
FAILED:
*ptrptr = ptr;
return FALSE;
}
/*************************************************
*      Compile sequence of alternatives          *
*************************************************/

/* Compile a bracketed group, i.e. one or more branches separated by '|'.

On entry the pattern pointer is just past the opening bracket character and
the code pointer addresses the byte into which the BRA operator has been
stored. Three bytes are reserved at the start of the group and at the start of
every later branch (opcode plus a two-byte, high-byte-first offset to the next
branch or to the terminating KET).

If the ims options are changed at the start (for a (?ims: group) or during any
branch, an OP_OPT item is emitted at the start of every following branch so
that they are set correctly at run time, and the new options are handed to
every subsequent branch compile. When that happened, an OP_OPT restoring the
ims bits that were in force on entry is emitted after the final KET.

On successful return the pattern pointer is left on the character that ended
the last branch (anything other than '|'), and *codeptr points past the
emitted code. On failure only *ptrptr is updated.

Arguments:
  options     the option bits
  optchanged  new ims options to set as if (?ims) were at the start, or -1
                for no change
  brackets    -> int containing the number of extracting brackets used
  codeptr     -> the address of the current code pointer
  ptrptr      -> the address of the current pattern pointer
  errorptr    -> pointer to error message
  lookbehind  TRUE if this is a lookbehind assertion
  condref     >= 0 for OP_CREF setting at start of conditional group
  reqchar     -> place to put the last required character, or a negative number
  countlits   -> place to put the shortest literal count of any branch
  cd          points to the data block with tables pointers

Returns:      TRUE on success; FALSE if a branch fails to compile or if a
                lookbehind branch is not of fixed length (ERR25)
*/

static BOOL
compile_regex(int options, int optchanged, int *brackets, uschar **codeptr,
  const uschar **ptrptr, const char **errorptr, BOOL lookbehind, int condref,
  int *reqchar, int *countlits, compile_data *cd)
{
  const uschar *pat = *ptrptr;
  uschar *out = *codeptr;
  uschar *group_start = out;        /* where the whole group begins */
  uschar *branch_start = out;       /* BRA/ALT of the branch in progress */
  uschar *reverse_slot = NULL;      /* two-byte count after OP_REVERSE */
  const int saved_ims = options & PCRE_IMS;
  int br_reqchar, br_countlits;
  int length;

  *reqchar = -1;                    /* -1 => not yet set by any branch */
  *countlits = INT_MAX;

  /* Step over the opcode and its two-byte offset field. */

  out += 3;

  /* A reference-based conditional group starts with an OP_CREF item holding
  the reference number. */

  if (condref >= 0) {
    out[0] = OP_CREF;
    out[1] = condref;
    out += 2;
  }

  /* One iteration per alternative; the loop is left when the branch just
  compiled is not followed by '|'. */

  for (;;) {
    /* Emit any pending change of ims options and adopt it for this branch. */

    if (optchanged >= 0) {
      out[0] = OP_OPT;
      out[1] = optchanged;
      out += 2;
      options = (options & ~PCRE_IMS) | optchanged;
    }

    /* A lookbehind branch starts with OP_REVERSE and a placeholder count
    that is filled in once the branch length is known. */

    if (lookbehind) {
      *out++ = OP_REVERSE;
      reverse_slot = out;
      out[0] = 0;
      out[1] = 0;
      out += 2;
    }

    /* Compile the branch itself. */

    if (!compile_branch(options, brackets, &out, &pat, errorptr, &optchanged,
          &br_reqchar, &br_countlits, cd)) {
      *ptrptr = pat;
      return FALSE;
    }

    /* Record the length of the branch just compiled in its header. */

    length = out - branch_start;
    branch_start[1] = length >> 8;
    branch_start[2] = length & 255;

    /* Track the last required character: it survives only if every branch
    supplies the same one. -1 means unset; -2 means a branch had none or the
    branches disagree, and is sticky. */

    if (*reqchar != -2) {
      if (br_reqchar < 0) *reqchar = -2;
      else if (*reqchar == -1) *reqchar = br_reqchar;
      else if (*reqchar != br_reqchar) *reqchar = -2;
    }

    /* Keep the smallest literal count seen over all branches. */

    if (br_countlits < *countlits) *countlits = br_countlits;
    DPRINTF(("literal count = %d min=%d\n", br_countlits, *countlits));

    /* For a lookbehind, the branch must match a fixed-length string. The end
    of the branch is marked with OP_END just for the length scan; that byte
    is overwritten by the KET or ALT stored below. */

    if (lookbehind) {
      *out = OP_END;
      length = find_fixedlength(branch_start, options);
      DPRINTF(("fixed length = %d\n", length));
      if (length < 0) {
        *errorptr = ERR25;
        *ptrptr = pat;
        return FALSE;
      }
      reverse_slot[0] = (length >> 8);
      reverse_slot[1] = length & 255;
    }

    /* No further alternative: go and close the group. */

    if (*pat != '|') break;

    /* Another branch follows; start it with an "or" node and move past the
    vertical bar. */

    *out = OP_ALT;
    branch_start = out;
    out += 3;
    pat++;
  }

  /* End of the group: store the terminating ket with the length of the whole
  bracketed item, leaving the pattern pointer on the terminating character. */

  length = out - group_start;
  out[0] = OP_KET;
  out[1] = length >> 8;
  out[2] = length & 255;
  out += 3;

  /* If ims options were changed inside the group, follow the ket with an
  item that resets them to their values on entry. */

  if (optchanged >= 0) {
    *out++ = OP_OPT;
    *out++ = saved_ims;
  }

  *codeptr = out;
  *ptrptr = pat;
  return TRUE;
}
/*************************************************
*      Find first significant op code            *
*************************************************/

/* Helper for the functions that scan a compiled expression looking for a
fixed first character, an anchoring op code, and so on. It steps over items
that have no bearing on that: option settings, condition references, word
boundary assertions, and negative-lookahead / lookbehind assertion groups.
For one application, a change of the caseless option is important, which is
what optbit and optstop are for.

Arguments:
  code       pointer to the start of the group
  options    pointer to external options; only dereferenced when optbit is
               greater than zero
  optbit     the option bit whose changing is significant, or
               zero if none are
  optstop    TRUE to return on option change, otherwise change the options
               value and continue

Returns:     pointer to the first significant opcode
*/

static const uschar*
first_significant_code(const uschar *code, int *options, int optbit,
  BOOL optstop)
{
  for (;;) {
    const int op = (int)*code;

    if (op == OP_OPT) {
      /* An option setting matters only if it alters the bit of interest.
      In that case either stop here, or take over the new setting. */
      if (optbit > 0 && ((int)code[1] & optbit) != (*options & optbit)) {
        if (optstop) return code;
        *options = (int)code[1];
      }
      code += 2;
    }
    else if (op == OP_CREF) {
      code += 2;                      /* opcode plus reference number */
    }
    else if (op == OP_WORD_BOUNDARY || op == OP_NOT_WORD_BOUNDARY) {
      code += 1;
    }
    else if (op == OP_ASSERT_NOT || op == OP_ASSERTBACK ||
             op == OP_ASSERTBACK_NOT) {
      /* Follow the branch offsets to the end of the assertion group, then
      step over the three bytes of its terminating item. */
      do {
        code += (code[1] << 8) + code[2];
      } while (*code == OP_ALT);
      code += 3;
    }
    else {
      return code;                    /* anything else is significant */
    }
  }
  /* Control never reaches here */
}
/*************************************************
*          Check for anchored expression         *
*************************************************/

/* Decide whether a compiled expression is anchored by looking at every
alternative branch. A branch counts as anchored when its first significant
item is OP_SOD or OP_CIRC, or a bracket group all of whose alternatives are
themselves anchored (checked recursively). In a multiline pattern only OP_SOD
counts, since OP_CIRC can then match in the middle.

A branch is also implicitly anchored if it starts with .* and DOTALL is set,
because that will try the rest of the pattern at all possible matching points,
so there is no point trying them again.

Arguments:
  code       points to start of expression (the bracket)
  options    points to the options setting; may be updated while option
               settings at the start of a branch are skipped

Returns:     TRUE or FALSE
*/

static BOOL
is_anchored(register const uschar *code, int *options)
{
  for (;;) {
    const uschar *first = first_significant_code(code + 3, options,
      PCRE_MULTILINE, FALSE);
    register int op = *first;
    BOOL branch_anchored;

    if (op >= OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND) {
      /* Nested group: every one of its alternatives has to be anchored. */
      branch_anchored = is_anchored(first, options);
    }
    else if ((op == OP_TYPESTAR || op == OP_TYPEMINSTAR) &&
             (*options & PCRE_DOTALL) != 0) {
      /* A repeated character type anchors only when the type is "any". */
      branch_anchored = (first[1] == OP_ANY);
    }
    else {
      /* OP_SOD always anchors; OP_CIRC only when not in multiline mode. */
      branch_anchored = (op == OP_SOD) ||
        ((*options & PCRE_MULTILINE) == 0 && op == OP_CIRC);
    }

    if (!branch_anchored) return FALSE;

    /* Move on to the next alternative, if any. */
    code += (code[1] << 8) + code[2];
    if (*code != OP_ALT) return TRUE;
  }
}
/*************************************************
*         Check for starting with ^ or .*        *
*************************************************/

/* Find out whether every branch starts with ^ or .* so that "first char"
processing can be done to speed things up in multiline matching and for
non-DOTALL patterns that start with .* (which must start at the beginning or
after \n). A branch that starts with a bracket group qualifies when every
alternative of that group does (checked recursively).

Argument:  points to start of expression (the bracket)
Returns:   TRUE or FALSE
*/

static BOOL
is_startline(const uschar *code)
{
  for (;;) {
    /* No option bit is of interest here, so no options block is passed. */
    const uschar *first = first_significant_code(code + 3, NULL, 0, FALSE);
    register int op = *first;
    BOOL branch_ok;

    if (op >= OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND)
      branch_ok = is_startline(first);          /* nested group */
    else if (op == OP_TYPESTAR || op == OP_TYPEMINSTAR)
      branch_ok = (first[1] == OP_ANY);         /* repeat of "any" only */
    else
      branch_ok = (op == OP_CIRC);

    if (!branch_ok) return FALSE;

    /* Move on to the next alternative, if any. */
    code += (code[1] << 8) + code[2];
    if (*code != OP_ALT) return TRUE;
  }
}
/*************************************************
*          Check for fixed first char            *
*************************************************/

/* Look for a fixed first character. This is called for unanchored
expressions, as it speeds up their processing quite considerably. Every
alternative branch is examined: if they all start with the same char, or with
a bracket all of whose alternatives start with the same char (checked
recursively), that char is the answer; otherwise there is none.

Arguments:
  code       points to start of expression (the bracket)
  options    pointer to the options (used to check casing changes)

Returns:     -1 or the fixed first char
*/

static int
find_firstchar(const uschar *code, int *options)
{
  register int c = -1;          /* agreed first char so far; -1 => none yet */

  do {
    /* Stops on a change of the caseless option, in which case the item
    found is OP_OPT and the branch is rejected below. */
    const uschar *scode = first_significant_code(code + 3, options,
      PCRE_CASELESS, TRUE);
    register int op = *scode;
    int d;                      /* first char of this branch */

    if (op >= OP_BRA || op == OP_ASSERT || op == OP_ONCE || op == OP_COND) {
      /* Nested group: it must itself have a fixed first char. */
      d = find_firstchar(scode, options);
      if (d < 0) return -1;
    }
    else if (op == OP_EXACT) {
      d = scode[3];             /* char follows the two-byte count */
    }
    else if (op == OP_CHARS) {
      d = scode[2];             /* first char follows the length byte */
    }
    else if (op == OP_PLUS || op == OP_MINPLUS) {
      d = scode[1];             /* char follows the opcode */
    }
    else {
      return -1;                /* nothing that fixes a first char */
    }

    /* All branches have to agree on the character. */
    if (c < 0) c = d;
    else if (c != d) return -1;

    code += (code[1] << 8) + code[2];
  } while (*code == OP_ALT);

  return c;
}
/*************************************************
* Compile a Regular Expression *
*************************************************/
/* This function takes a string and returns a pointer to a block of store
holding a compiled version of the expression.
Arguments:
pattern the regular expression
options various option bits
errorptr pointer to pointer to error text
erroroffset ptr offset in pattern where error was detected
tables pointer to character tables or NULL
Returns: pointer to compiled data block, or NULL on error,
with errorptr and erroroffset set
*/
pcre *
pcre_compile(const char *pattern, int options, const char **errorptr,
int *erroroffset, const unsigned char *tables)
{
real_pcre *re;
int length = 3; /* For initial BRA plus length */
int runlength;
int c, reqchar, countlits;
int bracount = 0;
int top_backref = 0;
int branch_extra = 0;
int branch_newextra;
unsigned int brastackptr = 0;
size_t size;
uschar *code;
const uschar *ptr;
compile_data compile_block;
int brastack[BRASTACK_SIZE];
uschar bralenstack[BRASTACK_SIZE];
#ifdef DEBUG
uschar *code_base, *code_end;
#endif
/* Can't support UTF8 unless PCRE has been compiled to include the code. */
#ifndef SUPPORT_UTF8
if ((options & PCRE_UTF8) != 0)
{
*errorptr = ERR32;
return NULL;
}
#endif
/* We can't pass back an error message if errorptr is NULL; I guess the best we
can do is just return NULL. */
if (errorptr == NULL) return NULL;
*errorptr = NULL;
/* However, we can give a message for this error */
if (erroroffset == NULL)
{
*errorptr = ERR16;
return NULL;
}
*erroroffset = 0;
if ((options & ~PUBLIC_OPTIONS) != 0)
{
*errorptr = ERR17;
return NULL;
}
/* Set up pointers to the individual character tables */
if (tables == NULL) tables = pcre_default_tables;
compile_block.lcc = tables + lcc_offset;
compile_block.fcc = tables + fcc_offset;
compile_block.cbits = tables + cbits_offset;
compile_block.ctypes = tables + ctypes_offset;
/* Reflect pattern for debugging output */
DPRINTF(("------------------------------------------------------------------\n"));
DPRINTF(("%s\n", pattern));
/* The first thing to do is to make a pass over the pattern to compute the
amount of store required to hold the compiled code. This does not have to be
perfect as long as errors are overestimates. At the same time we can detect any
internal flag settings. Make an attempt to correct for any counted white space
if an "extended" flag setting appears late in the pattern. We can't be so
clever for #-comments. */
ptr = (const uschar *)(pattern - 1);
while ((c = *(++ptr)) != 0)
{
int min, max;
int class_charcount;
if ((options & PCRE_EXTENDED) != 0)
{
if ((compile_block.ctypes[c] & ctype_space) != 0) continue;
if (c == '#')
{
/* The space before the ; is to avoid a warning on a silly compiler
on the Macintosh. */
while ((c = *(++ptr)) != 0 && c != '\n') ;
continue;
}
}
switch(c)
{
/* A backslashed item may be an escaped "normal" character or a
character type. For a "normal" character, put the pointers and
character back so that tests for whitespace etc. in the input
are done correctly. */
case '\\':
{
const uschar *save_ptr = ptr;
c = check_escape(&ptr, errorptr, bracount, options, FALSE, &compile_block);
if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
if (c >= 0)
{
ptr = save_ptr;
c = '\\';
goto NORMAL_CHAR;
}
}
length++;
/* A back reference needs an additional char, plus either one or 5
bytes for a repeat. We also need to keep the value of the highest
back reference. */
if (c <= -ESC_REF)
{
int refnum = -c - ESC_REF;
if (refnum > top_backref) top_backref = refnum;
length++; /* For single back reference */
if (ptr[1] == '{' && is_counted_repeat(ptr+2, &compile_block))
{
ptr = read_repeat_counts(ptr+2, &min, &max, errorptr, &compile_block);
if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
if ((min == 0 && (max == 1 || max == -1)) ||
(min == 1 && max == -1))
length++;
else length += 5;
if (ptr[1] == '?') ptr++;
}
}
continue;
case '^':
case '.':
case '$':
case '*': /* These repeats won't be after brackets; */
case '+': /* those are handled separately */
case '?':
length++;
continue;
/* This covers the cases of repeats after a single char, metachar, class,
or back reference. */
case '{':
if (!is_counted_repeat(ptr+1, &compile_block)) goto NORMAL_CHAR;
ptr = read_repeat_counts(ptr+1, &min, &max, errorptr, &compile_block);
if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
if ((min == 0 && (max == 1 || max == -1)) ||
(min == 1 && max == -1))
length++;
else
{
length--; /* Uncount the original char or metachar */
if (min == 1) length++; else if (min > 0) length += 4;
if (max > 0) length += 4; else length += 2;
}
if (ptr[1] == '?') ptr++;
continue;
/* An alternation contains an offset to the next branch or ket. If any ims
options changed in the previous branch(es), and/or if we are in a
lookbehind assertion, extra space will be needed at the start of the
branch. This is handled by branch_extra. */
case '|':
length += 3 + branch_extra;
continue;
/* A character class uses 33 characters. Don't worry about character types
that aren't allowed in classes - they'll get picked up during the compile.
A character class that contains only one character uses 2 or 3 bytes,
depending on whether it is negated or not. Notice this where we can. */
case '[':
class_charcount = 0;
if (*(++ptr) == '^') ptr++;
do
{
if (*ptr == '\\')
{
int ch = check_escape(&ptr, errorptr, bracount, options, TRUE,
&compile_block);
if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
if (-ch == ESC_b) class_charcount++; else class_charcount = 10;
}
else class_charcount++;
ptr++;
}
while (*ptr != 0 && *ptr != ']');
/* Repeats for negated single chars are handled by the general code */
if (class_charcount == 1) length += 3; else
{
length += 33;
/* A repeat needs either 1 or 5 bytes. */
if (*ptr != 0 && ptr[1] == '{' && is_counted_repeat(ptr+2, &compile_block))
{
ptr = read_repeat_counts(ptr+2, &min, &max, errorptr, &compile_block);
if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
if ((min == 0 && (max == 1 || max == -1)) ||
(min == 1 && max == -1))
length++;
else length += 5;
if (ptr[1] == '?') ptr++;
}
}
continue;
/* Brackets may be genuine groups or special things */
case '(':
branch_newextra = 0;
/* Handle special forms of bracket, which all start (? */
if (ptr[1] == '?')
{
int set, unset;
int *optset;
switch (c = ptr[2])
{
/* Skip over comments entirely */
case '#':
ptr += 3;
while (*ptr != 0 && *ptr != ')') ptr++;
if (*ptr == 0)
{
*errorptr = ERR18;
goto PCRE_ERROR_RETURN;
}
continue;
/* Non-referencing groups and lookaheads just move the pointer on, and
then behave like a non-special bracket, except that they don't increment
the count of extracting brackets. Ditto for the "once only" bracket,
which is in Perl from version 5.005. */
case ':':
case '=':
case '!':
case '>':
ptr += 2;
break;
/* A recursive call to the regex is an extension, to provide the
facility which can be obtained by $(?p{perl-code}) in Perl 5.6. */
case 'R':
if (ptr[3] != ')')
{
*errorptr = ERR29;
goto PCRE_ERROR_RETURN;
}
ptr += 3;
length += 1;
break;
/* Lookbehinds are in Perl from version 5.005 */
case '<':
if (ptr[3] == '=' || ptr[3] == '!')
{
ptr += 3;
branch_newextra = 3;
length += 3; /* For the first branch */
break;
}
*errorptr = ERR24;
goto PCRE_ERROR_RETURN;
/* Conditionals are in Perl from version 5.005. The bracket must either
be followed by a number (for bracket reference) or by an assertion
group. */
case '(':
if ((compile_block.ctypes[ptr[3]] & ctype_digit) != 0)
{
ptr += 4;
length += 2;
while ((compile_block.ctypes[*ptr] & ctype_digit) != 0) ptr++;
if (*ptr != ')')
{
*errorptr = ERR26;
goto PCRE_ERROR_RETURN;
}
}
else /* An assertion must follow */
{
ptr++; /* Can treat like ':' as far as spacing is concerned */
if (ptr[2] != '?' ||
(ptr[3] != '=' && ptr[3] != '!' && ptr[3] != '<') )
{
ptr += 2; /* To get right offset in message */
*errorptr = ERR28;
goto PCRE_ERROR_RETURN;
}
}
break;
/* Else loop checking valid options until ) is met. Anything else is an
error. If we are without any brackets, i.e. at top level, the settings
act as if specified in the options, so massage the options immediately.
This is for backward compatibility with Perl 5.004. */
default:
set = unset = 0;
optset = &set;
ptr += 2;
for (;; ptr++)
{
c = *ptr;
switch (c)
{
case 'i':
*optset |= PCRE_CASELESS;
continue;
case 'm':
*optset |= PCRE_MULTILINE;
continue;
case 's':
*optset |= PCRE_DOTALL;
continue;
case 'x':
*optset |= PCRE_EXTENDED;
continue;
case 'X':
*optset |= PCRE_EXTRA;
continue;
case 'U':
*optset |= PCRE_UNGREEDY;
continue;
case '-':
optset = &unset;
continue;
/* A termination by ')' indicates an options-setting-only item;
this is global at top level; otherwise nothing is done here and
it is handled during the compiling process on a per-bracket-group
basis. */
case ')':
if (brastackptr == 0)
{
options = (options | set) & (~unset);
set = unset = 0; /* To save length */
}
/* Fall through */
/* A termination by ':' indicates the start of a nested group with
the given options set. This is again handled at compile time, but
we must allow for compiled space if any of the ims options are
set. We also have to allow for resetting space at the end of
the group, which is why 4 is added to the length and not just 2.
If there are several changes of options within the same group, this
will lead to an over-estimate on the length, but this shouldn't
matter very much. We also have to allow for resetting options at
the start of any alternations, which we do by setting
branch_newextra to 2. Finally, we record whether the case-dependent
flag ever changes within the regex. This is used by the "required
character" code. */
case ':':
if (((set|unset) & PCRE_IMS) != 0)
{
length += 4;
branch_newextra = 2;
if (((set|unset) & PCRE_CASELESS) != 0) options |= PCRE_ICHANGED;
}
goto END_OPTIONS;
/* Unrecognized option character */
default:
*errorptr = ERR12;
goto PCRE_ERROR_RETURN;
}
}
/* If we hit a closing bracket, that's it - this is a freestanding
option-setting. We need to ensure that branch_extra is updated if
necessary. The only values branch_newextra can have here are 0 or 2.
If the value is 2, then branch_extra must either be 2 or 5, depending
on whether this is a lookbehind group or not. */
END_OPTIONS:
if (c == ')')
{
if (branch_newextra == 2 && (branch_extra == 0 || branch_extra == 3))
branch_extra += branch_newextra;
continue;
}
/* If options were terminated by ':' control comes here. Fall through
to handle the group below. */
}
}
/* Extracting brackets must be counted so we can process escapes in a
Perlish way. */
else bracount++;
/* Non-special forms of bracket. Save length for computing whole length
at end if there's a repeat that requires duplication of the group. Also
save the current value of branch_extra, and start the new group with
the new value. If non-zero, this will either be 2 for a (?imsx: group, or 3
for a lookbehind assertion. */
if (brastackptr >= sizeof(brastack)/sizeof(int))
{
*errorptr = ERR19;
goto PCRE_ERROR_RETURN;
}
bralenstack[brastackptr] = branch_extra;
branch_extra = branch_newextra;
brastack[brastackptr++] = length;
length += 3;
continue;
/* Handle ket. Look for subsequent max/min; for certain sets of values we
have to replicate this bracket up to that many times. If brastackptr is
0 this is an unmatched bracket which will generate an error, but take care
not to try to access brastack[-1] when computing the length and restoring
the branch_extra value. */
case ')':
length += 3;
{
int minval = 1;
int maxval = 1;
int duplength;
if (brastackptr > 0)
{
duplength = length - brastack[--brastackptr];
branch_extra = bralenstack[brastackptr];
}
else duplength = 0;
/* Leave ptr at the final char; for read_repeat_counts this happens
automatically; for the others we need an increment. */
if ((c = ptr[1]) == '{' && is_counted_repeat(ptr+2, &compile_block))
{
ptr = read_repeat_counts(ptr+2, &minval, &maxval, errorptr,
&compile_block);
if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
}
else if (c == '*') { minval = 0; maxval = -1; ptr++; }
else if (c == '+') { maxval = -1; ptr++; }
else if (c == '?') { minval = 0; ptr++; }
/* If the minimum is zero, we have to allow for an OP_BRAZERO before the
group, and if the maximum is greater than zero, we have to replicate
maxval-1 times; each replication acquires an OP_BRAZERO plus a nesting
bracket set - hence the 7. */
if (minval == 0)
{
length++;
if (maxval > 0) length += (maxval - 1) * (duplength + 7);
}
/* When the minimum is greater than zero, we have to replicate up to
minval-1 times, with no additions required in the copies. Then, if
there is a limited maximum we have to replicate up to maxval-1 times
allowing for a BRAZERO item before each optional copy and nesting
brackets for all but one of the optional copies. */
else
{
length += (minval - 1) * duplength;
if (maxval > minval) /* Need this test as maxval=-1 means no limit */
length += (maxval - minval) * (duplength + 7) - 6;
}
}
continue;
/* Non-special character. For a run of such characters the length required
is the number of characters + 2, except that the maximum run length is 255.
We won't get a skipped space or a non-data escape or the start of a #
comment as the first character, so the length can't be zero. */
NORMAL_CHAR:
default:
length += 2;
runlength = 0;
do
{
if ((options & PCRE_EXTENDED) != 0)
{
if ((compile_block.ctypes[c] & ctype_space) != 0) continue;
if (c == '#')
{
/* The space before the ; is to avoid a warning on a silly compiler
on the Macintosh. */
while ((c = *(++ptr)) != 0 && c != '\n') ;
continue;
}
}
/* Backslash may introduce a data char or a metacharacter; stop the
string before the latter. */
if (c == '\\')
{
const uschar *saveptr = ptr;
c = check_escape(&ptr, errorptr, bracount, options, FALSE,
&compile_block);
if (*errorptr != NULL) goto PCRE_ERROR_RETURN;
if (c < 0) { ptr = saveptr; break; }
#ifdef SUPPORT_UTF8
if (c > 127 && (options & PCRE_UTF8) != 0)
{
int i;
for (i = 0; i < sizeof(utf8_table1)/sizeof(int); i++)
if (c <= utf8_table1[i]) break;
runlength += i;
}
#endif
}
/* Ordinary character or single-char escape */
runlength++;
}
/* This "while" is the end of the "do" above. */
while (runlength < MAXLIT &&
(compile_block.ctypes[c = *(++ptr)] & ctype_meta) == 0);
ptr--;
length += runlength;
continue;
}
}
length += 4; /* For final KET and END */
if (length > 65539)
{
*errorptr = ERR20;
return NULL;
}
/* Compute the size of data block needed and get it, either from malloc or
externally provided function. We specify "code[0]" in the offsetof() expression
rather than just "code", because it has been reported that one broken compiler
fails on "code" because it is also an independent variable. It should make no
difference to the value of the offsetof(). */
size = length + offsetof(real_pcre, code[0]);
re = (real_pcre *)(pcre_malloc)(size);
if (re == NULL)
{
*errorptr = ERR21;
return NULL;
}
/* Put in the magic number, and save the size, options, and table pointer */
re->magic_number = MAGIC_NUMBER;
re->size = size;
re->options = options;
re->tables = tables;
/* Set up a starting, non-extracting bracket, then compile the expression. On
error, *errorptr will be set non-NULL, so we don't need to look at the result
of the function here. */
ptr = (const uschar *)pattern;
code = re->code;
*code = OP_BRA;
bracount = 0;
(void)compile_regex(options, -1, &bracount, &code, &ptr, errorptr, FALSE, -1,
&reqchar, &countlits, &compile_block);
re->top_bracket = bracount;
re->top_backref = top_backref;
/* If not reached end of pattern on success, there's an excess bracket. */
if (*errorptr == NULL && *ptr != 0) *errorptr = ERR22;
/* Fill in the terminating state and check for disastrous overflow, but
if debugging, leave the test till after things are printed out. */
*code++ = OP_END;
#ifndef DEBUG
if (code - re->code > length) *errorptr = ERR23;
#endif
/* Give an error if there's back reference to a non-existent capturing
subpattern. */
if (top_backref > re->top_bracket) *errorptr = ERR15;
/* Failed to compile */
if (*errorptr != NULL)
{
(pcre_free)(re);
PCRE_ERROR_RETURN:
*erroroffset = ptr - (const uschar *)pattern;
return NULL;
}
/* If the anchored option was not passed, set flag if we can determine that the
pattern is anchored by virtue of ^ characters or \A or anything else (such as
starting with .* when DOTALL is set).
Otherwise, see if we can determine what the first character has to be, because
that speeds up unanchored matches no end. If not, see if we can set the
PCRE_STARTLINE flag. This is helpful for multiline matches when all branches
start with ^. and also when all branches start with .* for non-DOTALL matches.
*/
if ((options & PCRE_ANCHORED) == 0)
{
int temp_options = options;
if (is_anchored(re->code, &temp_options))
re->options |= PCRE_ANCHORED;
else
{
int ch = find_firstchar(re->code, &temp_options);
if (ch >= 0)
{
re->first_char = ch;
re->options |= PCRE_FIRSTSET;
}
else if (is_startline(re->code))
re->options |= PCRE_STARTLINE;
}
}
/* Save the last required character if there are at least two literal
characters on all paths, or if there is no first character setting. */
if (reqchar >= 0 && (countlits > 1 || (re->options & PCRE_FIRSTSET) == 0))
{
re->req_char = reqchar;
re->options |= PCRE_REQCHSET;
}
/* Print out the compiled data for debugging */
#ifdef DEBUG
printf("Length = %d top_bracket = %d top_backref = %d\n",
length, re->top_bracket, re->top_backref);
if (re->options != 0)
{
printf("%s%s%s%s%s%s%s%s%s\n",
((re->options & PCRE_ANCHORED) != 0)? "anchored " : "",
((re->options & PCRE_CASELESS) != 0)? "caseless " : "",
((re->options & PCRE_ICHANGED) != 0)? "case state changed " : "",
((re->options & PCRE_EXTENDED) != 0)? "extended " : "",
((re->options & PCRE_MULTILINE) != 0)? "multiline " : "",
((re->options & PCRE_DOTALL) != 0)? "dotall " : "",
((re->options & PCRE_DOLLAR_ENDONLY) != 0)? "endonly " : "",
((re->options & PCRE_EXTRA) != 0)? "extra " : "",
((re->options & PCRE_UNGREEDY) != 0)? "ungreedy " : "");
}
if ((re->options & PCRE_FIRSTSET) != 0)
{
if (isprint(re->first_char)) printf("First char = %c\n", re->first_char);
else printf("First char = \\x%02x\n", re->first_char);
}
if ((re->options & PCRE_REQCHSET) != 0)
{
if (isprint(re->req_char)) printf("Req char = %c\n", re->req_char);
else printf("Req char = \\x%02x\n", re->req_char);
}
code_end = code;
code_base = code = re->code;
while (code < code_end)
{
int charlength;
printf("%3d ", code - code_base);
if (*code >= OP_BRA)
{
printf("%3d Bra %d", (code[1] << 8) + code[2], *code - OP_BRA);
code += 2;
}
else switch(*code)
{
case OP_OPT:
printf(" %.2x %s", code[1], OP_names[*code]);
code++;
break;
case OP_COND:
printf("%3d Cond", (code[1] << 8) + code[2]);
code += 2;
break;
case OP_CREF:
printf(" %.2d %s", code[1], OP_names[*code]);
code++;
break;
case OP_CHARS:
charlength = *(++code);
printf("%3d ", charlength);
while (charlength-- > 0)
if (isprint(c = *(++code))) printf("%c", c); else printf("\\x%02x", c);
break;
case OP_KETRMAX:
case OP_KETRMIN:
case OP_ALT:
case OP_KET:
case OP_ASSERT:
case OP_ASSERT_NOT:
case OP_ASSERTBACK:
case OP_ASSERTBACK_NOT:
case OP_ONCE:
printf("%3d %s", (code[1] << 8) + code[2], OP_names[*code]);
code += 2;
break;
case OP_REVERSE:
printf("%3d %s", (code[1] << 8) + code[2], OP_names[*code]);
code += 2;
break;
case OP_STAR:
case OP_MINSTAR:
case OP_PLUS:
case OP_MINPLUS:
case OP_QUERY:
case OP_MINQUERY:
case OP_TYPESTAR:
case OP_TYPEMINSTAR:
case OP_TYPEPLUS:
case OP_TYPEMINPLUS:
case OP_TYPEQUERY:
case OP_TYPEMINQUERY:
if (*code >= OP_TYPESTAR)
printf(" %s", OP_names[code[1]]);
else if (isprint(c = code[1])) printf(" %c", c);
else printf(" \\x%02x", c);
printf("%s", OP_names[*code++]);
break;
case OP_EXACT:
case OP_UPTO:
case OP_MINUPTO:
if (isprint(c = code[3])) printf(" %c{", c);
else printf(" \\x%02x{", c);
if (*code != OP_EXACT) printf("0,");
printf("%d}", (code[1] << 8) + code[2]);
if (*code == OP_MINUPTO) printf("?");
code += 3;
break;
case OP_TYPEEXACT:
case OP_TYPEUPTO:
case OP_TYPEMINUPTO:
printf(" %s{", OP_names[code[3]]);
if (*code != OP_TYPEEXACT) printf(",");
printf("%d}", (code[1] << 8) + code[2]);
if (*code == OP_TYPEMINUPTO) printf("?");
code += 3;
break;
case OP_NOT:
if (isprint(c = *(++code))) printf(" [^%c]", c);
else printf(" [^\\x%02x]", c);
break;
case OP_NOTSTAR:
case OP_NOTMINSTAR:
case OP_NOTPLUS:
case OP_NOTMINPLUS:
case OP_NOTQUERY:
case OP_NOTMINQUERY:
if (isprint(c = code[1])) printf(" [^%c]", c);
else printf(" [^\\x%02x]", c);
printf("%s", OP_names[*code++]);
break;
case OP_NOTEXACT:
case OP_NOTUPTO:
case OP_NOTMINUPTO:
if (isprint(c = code[3])) printf(" [^%c]{", c);
else printf(" [^\\x%02x]{", c);
if (*code != OP_NOTEXACT) printf(",");
printf("%d}", (code[1] << 8) + code[2]);
if (*code == OP_NOTMINUPTO) printf("?");
code += 3;
break;
case OP_REF:
printf(" \\%d", *(++code));
code ++;
goto CLASS_REF_REPEAT;
case OP_CLASS:
{
int i, min, max;
code++;
printf(" [");
for (i = 0; i < 256; i++)
{
if ((code[i/8] & (1 << (i&7))) != 0)
{
int j;
for (j = i+1; j < 256; j++)
if ((code[j/8] & (1 << (j&7))) == 0) break;
if (i == '-' || i == ']') printf("\\");
if (isprint(i)) printf("%c", i); else printf("\\x%02x", i);
if (--j > i)
{
printf("-");
if (j == '-' || j == ']') printf("\\");
if (isprint(j)) printf("%c", j); else printf("\\x%02x", j);
}
i = j;
}
}
printf("]");
code += 32;
CLASS_REF_REPEAT:
switch(*code)
{
case OP_CRSTAR:
case OP_CRMINSTAR:
case OP_CRPLUS:
case OP_CRMINPLUS:
case OP_CRQUERY:
case OP_CRMINQUERY:
printf("%s", OP_names[*code]);
break;
case OP_CRRANGE:
case OP_CRMINRANGE:
min = (code[1] << 8) + code[2];
max = (code[3] << 8) + code[4];
if (max == 0) printf("{%d,}", min);
else printf("{%d,%d}", min, max);
if (*code == OP_CRMINRANGE) printf("?");
code += 4;
break;
default:
code--;
}
}
break;
/* Anything else is just a one-node item */
default:
printf(" %s", OP_names[*code]);
break;
}
code++;
printf("\n");
}
printf("------------------------------------------------------------------\n");
/* This check is done here in the debugging case so that the code that
was compiled can be seen. */
if (code - re->code > length)
{
*errorptr = ERR23;
(pcre_free)(re);
*erroroffset = ptr - (uschar *)pattern;
return NULL;
}
#endif
return (pcre *)re;
}
/*************************************************
* Match a back-reference *
*************************************************/
/* If a back reference hasn't been set, the length that is passed is greater
than the number of characters left in the string, so the match fails.
Arguments:
offset index into the offset vector
eptr points into the subject
length length to be matched
md points to match data block
ims the ims flags
Returns: TRUE if matched
*/
static BOOL
match_ref(int offset, register const uschar *eptr, int length, match_data *md,
unsigned long int ims)
{
  /* Where the previously captured text begins: the subject start plus the
  value stored in slot "offset" of the offset vector. */
  const uschar *captured = md->start_subject + md->offset_vector[offset];

#ifdef DEBUG
  if (eptr < md->end_subject)
    {
    printf("matching subject ");
    pchars(eptr, length, TRUE, md);
    }
  else
    printf("matching subject ");
  printf(" against backref ");
  pchars(captured, length, FALSE, md);
  printf("\n");
#endif

  /* Fewer than "length" characters remain in the subject, so the reference
  cannot match. Callers rely on this test to reject unset references, for
  which they pass a length larger than what is left. */
  if (md->end_subject - eptr < length) return FALSE;

  /* The caseless and case-sensitive comparisons are kept as two separate
  loops so the option test is not repeated for every character. */
  if ((ims & PCRE_CASELESS) == 0)
    {
    /* Exact comparison, one character at a time */
    for (; length > 0; length--)
      {
      if (*captured++ != *eptr++) return FALSE;
      }
    }
  else
    {
    /* Caseless: map both characters through md->lcc before comparing */
    for (; length > 0; length--)
      {
      if (md->lcc[*captured++] != md->lcc[*eptr++]) return FALSE;
      }
    }

  return TRUE;
}
/*************************************************
* Match from current position *
*************************************************/
/* On entry ecode points to the first opcode, and eptr to the first character
in the subject string, while eptrb holds the value of eptr at the start of the
last bracketed group - used for breaking infinite loops matching zero-length
strings.
Arguments:
eptr pointer in subject
ecode position in code
offset_top current top pointer
md pointer to "static" info for the match
ims current /i, /m, and /s options
eptrb pointer to chain of blocks containing eptr at start of
brackets - for testing for empty matches
flags can contain
match_condassert - this is an assertion condition
match_isgroup - this is the start of a bracketed group
Returns: TRUE if matched
*/
static BOOL
match(register const uschar *eptr, register const uschar *ecode,
int offset_top, match_data *md, unsigned long int ims, eptrblock *eptrb,
int flags)
{
unsigned long int original_ims = ims; /* Save for resetting on ')' */
eptrblock newptrb;
/* At the start of a bracketed group, add the current subject pointer to the
stack of such pointers, to be re-instated at the end of the group when we hit
the closing ket. When match() is called in other circumstances, we don't add to
the stack. */
if ((flags & match_isgroup) != 0)
{
newptrb.prev = eptrb;
newptrb.saved_eptr = eptr;
eptrb = &newptrb;
}
/* Now start processing the operations. */
for (;;)
{
int op = (int)*ecode;
int min, max, ctype;
register int i;
register int c;
BOOL minimize = FALSE;
/* Opening capturing bracket. If there is space in the offset vector, save
the current subject position in the working slot at the top of the vector. We
mustn't change the current values of the data slot, because they may be set
from a previous iteration of this group, and be referred to by a reference
inside the group.
If the bracket fails to match, we need to restore this value and also the
values of the final offsets, in case they were set by a previous iteration of
the same bracket.
If there isn't enough space in the offset vector, treat this as if it were a
non-capturing bracket. Don't worry about setting the flag for the error case
here; that is handled in the code for KET. */
if (op > OP_BRA)
{
int number = op - OP_BRA;
int offset = number << 1;
#ifdef DEBUG
printf("start bracket %d subject=", number);
pchars(eptr, 16, TRUE, md);
printf("\n");
#endif
if (offset < md->offset_max)
{
int save_offset1 = md->offset_vector[offset];
int save_offset2 = md->offset_vector[offset+1];
int save_offset3 = md->offset_vector[md->offset_end - number];
DPRINTF(("saving %d %d %d\n", save_offset1, save_offset2, save_offset3));
md->offset_vector[md->offset_end - number] = eptr - md->start_subject;
do
{
if (match(eptr, ecode+3, offset_top, md, ims, eptrb, match_isgroup))
return TRUE;
ecode += (ecode[1] << 8) + ecode[2];
}
while (*ecode == OP_ALT);
DPRINTF(("bracket %d failed\n", number));
md->offset_vector[offset] = save_offset1;
md->offset_vector[offset+1] = save_offset2;
md->offset_vector[md->offset_end - number] = save_offset3;
return FALSE;
}
/* Insufficient room for saving captured contents */
else op = OP_BRA;
}
/* Other types of node can be handled by a switch */
switch(op)
{
case OP_BRA: /* Non-capturing bracket: optimized */
DPRINTF(("start bracket 0\n"));
do
{
if (match(eptr, ecode+3, offset_top, md, ims, eptrb, match_isgroup))
return TRUE;
ecode += (ecode[1] << 8) + ecode[2];
}
while (*ecode == OP_ALT);
DPRINTF(("bracket 0 failed\n"));
return FALSE;
/* Conditional group: compilation checked that there are no more than
two branches. If the condition is false, skipping the first branch takes us
past the end if there is only one branch, but that's OK because that is
exactly what going to the ket would do. */
case OP_COND:
if (ecode[3] == OP_CREF) /* Condition is extraction test */
{
int offset = ecode[4] << 1; /* Doubled reference number */
return match(eptr,
ecode + ((offset < offset_top && md->offset_vector[offset] >= 0)?
5 : 3 + (ecode[1] << 8) + ecode[2]),
offset_top, md, ims, eptrb, match_isgroup);
}
/* The condition is an assertion. Call match() to evaluate it - passing
the match_condassert flag causes it to stop at the end of an assertion. */
else
{
if (match(eptr, ecode+3, offset_top, md, ims, NULL,
match_condassert | match_isgroup))
{
ecode += 3 + (ecode[4] << 8) + ecode[5];
while (*ecode == OP_ALT) ecode += (ecode[1] << 8) + ecode[2];
}
else ecode += (ecode[1] << 8) + ecode[2];
return match(eptr, ecode+3, offset_top, md, ims, eptrb, match_isgroup);
}
/* Control never reaches here */
/* Skip over conditional reference data if encountered (should not be) */
case OP_CREF:
ecode += 2;
break;
/* End of the pattern. If PCRE_NOTEMPTY is set, fail if we have matched
an empty string - recursion will then try other alternatives, if any. */
case OP_END:
if (md->notempty && eptr == md->start_match) return FALSE;
md->end_match_ptr = eptr; /* Record where we ended */
md->end_offset_top = offset_top; /* and how many extracts were taken */
return TRUE;
/* Change option settings */
case OP_OPT:
ims = ecode[1];
ecode += 2;
DPRINTF(("ims set to %02lx\n", ims));
break;
/* Assertion brackets. Check the alternative branches in turn - the
matching won't pass the KET for an assertion. If any one branch matches,
the assertion is true. Lookbehind assertions have an OP_REVERSE item at the
start of each branch to move the current point backwards, so the code at
this level is identical to the lookahead case. */
case OP_ASSERT:
case OP_ASSERTBACK:
do
{
if (match(eptr, ecode+3, offset_top, md, ims, NULL, match_isgroup)) break;
ecode += (ecode[1] << 8) + ecode[2];
}
while (*ecode == OP_ALT);
if (*ecode == OP_KET) return FALSE;
/* If checking an assertion for a condition, return TRUE. */
if ((flags & match_condassert) != 0) return TRUE;
/* Continue from after the assertion, updating the offsets high water
mark, since extracts may have been taken during the assertion. */
do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT);
ecode += 3;
offset_top = md->end_offset_top;
continue;
/* Negative assertion: all branches must fail to match */
case OP_ASSERT_NOT:
case OP_ASSERTBACK_NOT:
do
{
if (match(eptr, ecode+3, offset_top, md, ims, NULL, match_isgroup))
return FALSE;
ecode += (ecode[1] << 8) + ecode[2];
}
while (*ecode == OP_ALT);
if ((flags & match_condassert) != 0) return TRUE;
ecode += 3;
continue;
/* Move the subject pointer back. This occurs only at the start of
each branch of a lookbehind assertion. If we are too close to the start to
move back, this match function fails. When working with UTF-8 we move
back a number of characters, not bytes. */
case OP_REVERSE:
#ifdef SUPPORT_UTF8
c = (ecode[1] << 8) + ecode[2];
for (i = 0; i < c; i++)
{
eptr--;
BACKCHAR(eptr)
}
#else
eptr -= (ecode[1] << 8) + ecode[2];
#endif
if (eptr < md->start_subject) return FALSE;
ecode += 3;
break;
/* Recursion matches the current regex, nested. If there are any capturing
brackets started but not finished, we have to save their starting points
and reinstate them after the recursion. However, we don't know how many
such there are (offset_top records the completed total) so we just have
to save all the potential data. There may be up to 99 such values, which
is a bit large to put on the stack, but using malloc for small numbers
seems expensive. As a compromise, the stack is used when there are fewer
than 16 values to store; otherwise malloc is used. A problem is what to do
if the malloc fails ... there is no way of returning to the top level with
an error. Save the top 15 values on the stack, and accept that the rest
may be wrong. */
case OP_RECURSE:
{
BOOL rc;
int *save;
int stacksave[15];
c = md->offset_max;
if (c < 16) save = stacksave; else
{
save = (int *)(pcre_malloc)((c+1) * sizeof(int));
if (save == NULL)
{
save = stacksave;
c = 15;
}
}
for (i = 1; i <= c; i++)
save[i] = md->offset_vector[md->offset_end - i];
rc = match(eptr, md->start_pattern, offset_top, md, ims, eptrb,
match_isgroup);
for (i = 1; i <= c; i++)
md->offset_vector[md->offset_end - i] = save[i];
if (save != stacksave) (pcre_free)(save);
if (!rc) return FALSE;
/* In case the recursion has set more capturing values, save the final
number, then move along the subject till after the recursive match,
and advance one byte in the pattern code. */
offset_top = md->end_offset_top;
eptr = md->end_match_ptr;
ecode++;
}
break;
/* "Once" brackets are like assertion brackets except that after a match,
the point in the subject string is not moved back. Thus there can never be
a move back into the brackets. Check the alternative branches in turn - the
matching won't pass the KET for this kind of subpattern. If any one branch
matches, we carry on as at the end of a normal bracket, leaving the subject
pointer. */
case OP_ONCE:
{
const uschar *prev = ecode;
const uschar *saved_eptr = eptr;
do
{
if (match(eptr, ecode+3, offset_top, md, ims, eptrb, match_isgroup))
break;
ecode += (ecode[1] << 8) + ecode[2];
}
while (*ecode == OP_ALT);
/* If hit the end of the group (which could be repeated), fail */
if (*ecode != OP_ONCE && *ecode != OP_ALT) return FALSE;
/* Continue as from after the assertion, updating the offsets high water
mark, since extracts may have been taken. */
do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT);
offset_top = md->end_offset_top;
eptr = md->end_match_ptr;
/* For a non-repeating ket, just continue at this level. This also
happens for a repeating ket if no characters were matched in the group.
This is the forcible breaking of infinite loops as implemented in Perl
5.005. If there is an options reset, it will get obeyed in the normal
course of events. */
if (*ecode == OP_KET || eptr == saved_eptr)
{
ecode += 3;
break;
}
/* The repeating kets try the rest of the pattern or restart from the
preceding bracket, in the appropriate order. We need to reset any options
that changed within the bracket before re-running it, so check the next
opcode. */
if (ecode[3] == OP_OPT)
{
ims = (ims & ~PCRE_IMS) | ecode[4];
DPRINTF(("ims set to %02lx at group repeat\n", ims));
}
if (*ecode == OP_KETRMIN)
{
if (match(eptr, ecode+3, offset_top, md, ims, eptrb, 0) ||
match(eptr, prev, offset_top, md, ims, eptrb, match_isgroup))
return TRUE;
}
else /* OP_KETRMAX */
{
if (match(eptr, prev, offset_top, md, ims, eptrb, match_isgroup) ||
match(eptr, ecode+3, offset_top, md, ims, eptrb, 0)) return TRUE;
}
}
return FALSE;
/* An alternation is the end of a branch; scan along to find the end of the
bracketed group and go to there. */
case OP_ALT:
do ecode += (ecode[1] << 8) + ecode[2]; while (*ecode == OP_ALT);
break;
/* BRAZERO and BRAMINZERO occur just before a bracket group, indicating
that it may occur zero times. It may repeat infinitely, or not at all -
i.e. it could be ()* or ()? in the pattern. Brackets with fixed upper
repeat limits are compiled as a number of copies, with the optional ones
preceded by BRAZERO or BRAMINZERO. */
case OP_BRAZERO:
{
const uschar *next = ecode+1;
if (match(eptr, next, offset_top, md, ims, eptrb, match_isgroup))
return TRUE;
do next += (next[1] << 8) + next[2]; while (*next == OP_ALT);
ecode = next + 3;
}
break;
case OP_BRAMINZERO:
{
const uschar *next = ecode+1;
do next += (next[1] << 8) + next[2]; while (*next == OP_ALT);
if (match(eptr, next+3, offset_top, md, ims, eptrb, match_isgroup))
return TRUE;
ecode++;
}
break;
/* End of a group, repeated or non-repeating. If we are at the end of
an assertion "group", stop matching and return TRUE, but record the
current high water mark for use by positive assertions. Do this also
for the "once" (non-backing-up) groups. */
case OP_KET:
case OP_KETRMIN:
case OP_KETRMAX:
{
const uschar *prev = ecode - (ecode[1] << 8) - ecode[2];
const uschar *saved_eptr = eptrb->saved_eptr;
eptrb = eptrb->prev; /* Back up the stack of bracket start pointers */
if (*prev == OP_ASSERT || *prev == OP_ASSERT_NOT ||
*prev == OP_ASSERTBACK || *prev == OP_ASSERTBACK_NOT ||
*prev == OP_ONCE)
{
md->end_match_ptr = eptr; /* For ONCE */
md->end_offset_top = offset_top;
return TRUE;
}
/* In all other cases except a conditional group we have to check the
group number back at the start and if necessary complete handling an
extraction by setting the offsets and bumping the high water mark. */
if (*prev != OP_COND)
{
int number = *prev - OP_BRA;
int offset = number << 1;
#ifdef DEBUG
printf("end bracket %d", number);
printf("\n");
#endif
if (number > 0)
{
if (offset >= md->offset_max) md->offset_overflow = TRUE; else
{
md->offset_vector[offset] =
md->offset_vector[md->offset_end - number];
md->offset_vector[offset+1] = eptr - md->start_subject;
if (offset_top <= offset) offset_top = offset + 2;
}
}
}
/* Reset the value of the ims flags, in case they got changed during
the group. */
ims = original_ims;
DPRINTF(("ims reset to %02lx\n", ims));
/* For a non-repeating ket, just continue at this level. This also
happens for a repeating ket if no characters were matched in the group.
This is the forcible breaking of infinite loops as implemented in Perl
5.005. If there is an options reset, it will get obeyed in the normal
course of events. */
if (*ecode == OP_KET || eptr == saved_eptr)
{
ecode += 3;
break;
}
/* The repeating kets try the rest of the pattern or restart from the
preceding bracket, in the appropriate order. */
if (*ecode == OP_KETRMIN)
{
if (match(eptr, ecode+3, offset_top, md, ims, eptrb, 0) ||
match(eptr, prev, offset_top, md, ims, eptrb, match_isgroup))
return TRUE;
}
else /* OP_KETRMAX */
{
if (match(eptr, prev, offset_top, md, ims, eptrb, match_isgroup) ||
match(eptr, ecode+3, offset_top, md, ims, eptrb, 0)) return TRUE;
}
}
return FALSE;
/* Start of subject unless notbol, or after internal newline if multiline */
case OP_CIRC:
if (md->notbol && eptr == md->start_subject) return FALSE;
if ((ims & PCRE_MULTILINE) != 0)
{
if (eptr != md->start_subject && eptr[-1] != '\n') return FALSE;
ecode++;
break;
}
/* ... else fall through */
/* Start of subject assertion */
case OP_SOD:
if (eptr != md->start_subject) return FALSE;
ecode++;
break;
/* Assert before internal newline if multiline, or before a terminating
newline unless endonly is set, else end of subject unless noteol is set. */
case OP_DOLL:
if ((ims & PCRE_MULTILINE) != 0)
{
if (eptr < md->end_subject) { if (*eptr != '\n') return FALSE; }
else { if (md->noteol) return FALSE; }
ecode++;
break;
}
else
{
if (md->noteol) return FALSE;
if (!md->endonly)
{
if (eptr < md->end_subject - 1 ||
(eptr == md->end_subject - 1 && *eptr != '\n')) return FALSE;
ecode++;
break;
}
}
/* ... else fall through */
/* End of subject assertion (\z) */
case OP_EOD:
if (eptr < md->end_subject) return FALSE;
ecode++;
break;
/* End of subject or ending \n assertion (\Z) */
case OP_EODN:
if (eptr < md->end_subject - 1 ||
(eptr == md->end_subject - 1 && *eptr != '\n')) return FALSE;
ecode++;
break;
/* Word boundary assertions */
case OP_NOT_WORD_BOUNDARY:
case OP_WORD_BOUNDARY:
{
BOOL prev_is_word = (eptr != md->start_subject) &&
((md->ctypes[eptr[-1]] & ctype_word) != 0);
BOOL cur_is_word = (eptr < md->end_subject) &&
((md->ctypes[*eptr] & ctype_word) != 0);
if ((*ecode++ == OP_WORD_BOUNDARY)?
cur_is_word == prev_is_word : cur_is_word != prev_is_word)
return FALSE;
}
break;
/* Match a single character type; inline for speed */
case OP_ANY:
if ((ims & PCRE_DOTALL) == 0 && eptr < md->end_subject && *eptr == '\n')
return FALSE;
if (eptr++ >= md->end_subject) return FALSE;
#ifdef SUPPORT_UTF8
if (md->utf8)
while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
#endif
ecode++;
break;
case OP_NOT_DIGIT:
if (eptr >= md->end_subject ||
(md->ctypes[*eptr++] & ctype_digit) != 0)
return FALSE;
ecode++;
break;
case OP_DIGIT:
if (eptr >= md->end_subject ||
(md->ctypes[*eptr++] & ctype_digit) == 0)
return FALSE;
ecode++;
break;
case OP_NOT_WHITESPACE:
if (eptr >= md->end_subject ||
(md->ctypes[*eptr++] & ctype_space) != 0)
return FALSE;
ecode++;
break;
case OP_WHITESPACE:
if (eptr >= md->end_subject ||
(md->ctypes[*eptr++] & ctype_space) == 0)
return FALSE;
ecode++;
break;
case OP_NOT_WORDCHAR:
if (eptr >= md->end_subject ||
(md->ctypes[*eptr++] & ctype_word) != 0)
return FALSE;
ecode++;
break;
case OP_WORDCHAR:
if (eptr >= md->end_subject ||
(md->ctypes[*eptr++] & ctype_word) == 0)
return FALSE;
ecode++;
break;
/* Match a back reference, possibly repeatedly. Look past the end of the
item to see if there is repeat information following. The code is similar
to that for character classes, but repeated for efficiency. Then obey
similar code to character type repeats - written out again for speed.
However, if the referenced string is the empty string, always treat
it as matched, any number of times (otherwise there could be infinite
loops). */
case OP_REF:
{
int length;
int offset = ecode[1] << 1; /* Doubled reference number */
ecode += 2; /* Advance past the item */
/* If the reference is unset, set the length to be longer than the amount
of subject left; this ensures that every attempt at a match fails. We
can't just fail here, because of the possibility of quantifiers with zero
minima. */
length = (offset >= offset_top || md->offset_vector[offset] < 0)?
md->end_subject - eptr + 1 :
md->offset_vector[offset+1] - md->offset_vector[offset];
/* Set up for repetition, or handle the non-repeated case */
switch (*ecode)
{
case OP_CRSTAR:
case OP_CRMINSTAR:
case OP_CRPLUS:
case OP_CRMINPLUS:
case OP_CRQUERY:
case OP_CRMINQUERY:
c = *ecode++ - OP_CRSTAR;
minimize = (c & 1) != 0;
min = rep_min[c]; /* Pick up values from tables; */
max = rep_max[c]; /* zero for max => infinity */
if (max == 0) max = INT_MAX;
break;
case OP_CRRANGE:
case OP_CRMINRANGE:
minimize = (*ecode == OP_CRMINRANGE);
min = (ecode[1] << 8) + ecode[2];
max = (ecode[3] << 8) + ecode[4];
if (max == 0) max = INT_MAX;
ecode += 5;
break;
default: /* No repeat follows */
if (!match_ref(offset, eptr, length, md, ims)) return FALSE;
eptr += length;
continue; /* With the main loop */
}
/* If the length of the reference is zero, just continue with the
main loop. */
if (length == 0) continue;
/* First, ensure the minimum number of matches are present. We get back
the length of the reference string explicitly rather than passing the
address of eptr, so that eptr can be a register variable. */
for (i = 1; i <= min; i++)
{
if (!match_ref(offset, eptr, length, md, ims)) return FALSE;
eptr += length;
}
/* If min = max, continue at the same level without recursion.
They are not both allowed to be zero. */
if (min == max) continue;
/* If minimizing, keep trying and advancing the pointer */
if (minimize)
{
for (i = min;; i++)
{
if (match(eptr, ecode, offset_top, md, ims, eptrb, 0))
return TRUE;
if (i >= max || !match_ref(offset, eptr, length, md, ims))
return FALSE;
eptr += length;
}
/* Control never gets here */
}
/* If maximizing, find the longest string and work backwards */
else
{
const uschar *pp = eptr;
for (i = min; i < max; i++)
{
if (!match_ref(offset, eptr, length, md, ims)) break;
eptr += length;
}
while (eptr >= pp)
{
if (match(eptr, ecode, offset_top, md, ims, eptrb, 0))
return TRUE;
eptr -= length;
}
return FALSE;
}
}
/* Control never gets here */
/* Match a character class, possibly repeatedly. Look past the end of the
item to see if there is repeat information following. Then obey similar
code to character type repeats - written out again for speed. */
case OP_CLASS:
{
const uschar *data = ecode + 1; /* Save for matching */
ecode += 33; /* Advance past the item */
switch (*ecode)
{
case OP_CRSTAR:
case OP_CRMINSTAR:
case OP_CRPLUS:
case OP_CRMINPLUS:
case OP_CRQUERY:
case OP_CRMINQUERY:
c = *ecode++ - OP_CRSTAR;
minimize = (c & 1) != 0;
min = rep_min[c]; /* Pick up values from tables; */
max = rep_max[c]; /* zero for max => infinity */
if (max == 0) max = INT_MAX;
break;
case OP_CRRANGE:
case OP_CRMINRANGE:
minimize = (*ecode == OP_CRMINRANGE);
min = (ecode[1] << 8) + ecode[2];
max = (ecode[3] << 8) + ecode[4];
if (max == 0) max = INT_MAX;
ecode += 5;
break;
default: /* No repeat follows */
min = max = 1;
break;
}
/* First, ensure the minimum number of matches are present. */
for (i = 1; i <= min; i++)
{
if (eptr >= md->end_subject) return FALSE;
GETCHARINC(c, eptr) /* Get character; increment eptr */
#ifdef SUPPORT_UTF8
/* We do not yet support class members > 255 */
if (c > 255) return FALSE;
#endif
if ((data[c/8] & (1 << (c&7))) != 0) continue;
return FALSE;
}
/* If max == min we can continue with the main loop without the
need to recurse. */
if (min == max) continue;
/* If minimizing, keep testing the rest of the expression and advancing
the pointer while it matches the class. */
if (minimize)
{
for (i = min;; i++)
{
if (match(eptr, ecode, offset_top, md, ims, eptrb, 0))
return TRUE;
if (i >= max || eptr >= md->end_subject) return FALSE;
GETCHARINC(c, eptr) /* Get character; increment eptr */
#ifdef SUPPORT_UTF8
/* We do not yet support class members > 255 */
if (c > 255) return FALSE;
#endif
if ((data[c/8] & (1 << (c&7))) != 0) continue;
return FALSE;
}
/* Control never gets here */
}
/* If maximizing, find the longest possible run, then work backwards. */
else
{
const uschar *pp = eptr;
int len = 1;
for (i = min; i < max; i++)
{
if (eptr >= md->end_subject) break;
GETCHARLEN(c, eptr, len) /* Get character, set length if UTF-8 */
#ifdef SUPPORT_UTF8
/* We do not yet support class members > 255 */
if (c > 255) break;
#endif
if ((data[c/8] & (1 << (c&7))) == 0) break;
eptr += len;
}
while (eptr >= pp)
{
if (match(eptr--, ecode, offset_top, md, ims, eptrb, 0))
return TRUE;
#ifdef SUPPORT_UTF8
BACKCHAR(eptr)
#endif
}
return FALSE;
}
}
/* Control never gets here */
/* Match a run of characters */
case OP_CHARS:
{
register int length = ecode[1];
ecode += 2;
#ifdef DEBUG /* Sigh. Some compilers never learn. */
if (eptr >= md->end_subject)
printf("matching subject against pattern ");
else
{
printf("matching subject ");
pchars(eptr, length, TRUE, md);
printf(" against pattern ");
}
pchars(ecode, length, FALSE, md);
printf("\n");
#endif
if (length > md->end_subject - eptr) return FALSE;
if ((ims & PCRE_CASELESS) != 0)
{
while (length-- > 0)
if (md->lcc[*ecode++] != md->lcc[*eptr++])
return FALSE;
}
else
{
while (length-- > 0) if (*ecode++ != *eptr++) return FALSE;
}
}
break;
/* Match a single character repeatedly; different opcodes share code. */
case OP_EXACT:
min = max = (ecode[1] << 8) + ecode[2];
ecode += 3;
goto REPEATCHAR;
case OP_UPTO:
case OP_MINUPTO:
min = 0;
max = (ecode[1] << 8) + ecode[2];
minimize = *ecode == OP_MINUPTO;
ecode += 3;
goto REPEATCHAR;
case OP_STAR:
case OP_MINSTAR:
case OP_PLUS:
case OP_MINPLUS:
case OP_QUERY:
case OP_MINQUERY:
c = *ecode++ - OP_STAR;
minimize = (c & 1) != 0;
min = rep_min[c]; /* Pick up values from tables; */
max = rep_max[c]; /* zero for max => infinity */
if (max == 0) max = INT_MAX;
/* Common code for all repeated single-character matches. We can give
up quickly if there are fewer than the minimum number of characters left in
the subject. */
REPEATCHAR:
if (min > md->end_subject - eptr) return FALSE;
c = *ecode++;
/* The code is duplicated for the caseless and caseful cases, for speed,
since matching characters is likely to be quite common. First, ensure the
minimum number of matches are present. If min = max, continue at the same
level without recursing. Otherwise, if minimizing, keep trying the rest of
the expression and advancing one matching character if failing, up to the
maximum. Alternatively, if maximizing, find the maximum number of
characters and work backwards. */
DPRINTF(("matching %c{%d,%d} against subject %.*s\n", c, min, max,
max, eptr));
if ((ims & PCRE_CASELESS) != 0)
{
c = md->lcc[c];
for (i = 1; i <= min; i++)
if (c != md->lcc[*eptr++]) return FALSE;
if (min == max) continue;
if (minimize)
{
for (i = min;; i++)
{
if (match(eptr, ecode, offset_top, md, ims, eptrb, 0))
return TRUE;
if (i >= max || eptr >= md->end_subject ||
c != md->lcc[*eptr++])
return FALSE;
}
/* Control never gets here */
}
else
{
const uschar *pp = eptr;
for (i = min; i < max; i++)
{
if (eptr >= md->end_subject || c != md->lcc[*eptr]) break;
eptr++;
}
while (eptr >= pp)
if (match(eptr--, ecode, offset_top, md, ims, eptrb, 0))
return TRUE;
return FALSE;
}
/* Control never gets here */
}
/* Caseful comparisons */
else
{
for (i = 1; i <= min; i++) if (c != *eptr++) return FALSE;
if (min == max) continue;
if (minimize)
{
for (i = min;; i++)
{
if (match(eptr, ecode, offset_top, md, ims, eptrb, 0))
return TRUE;
if (i >= max || eptr >= md->end_subject || c != *eptr++) return FALSE;
}
/* Control never gets here */
}
else
{
const uschar *pp = eptr;
for (i = min; i < max; i++)
{
if (eptr >= md->end_subject || c != *eptr) break;
eptr++;
}
while (eptr >= pp)
if (match(eptr--, ecode, offset_top, md, ims, eptrb, 0))
return TRUE;
return FALSE;
}
}
/* Control never gets here */
/* Match a negated single character */
case OP_NOT:
if (eptr >= md->end_subject) return FALSE;
ecode++;
if ((ims & PCRE_CASELESS) != 0)
{
if (md->lcc[*ecode++] == md->lcc[*eptr++]) return FALSE;
}
else
{
if (*ecode++ == *eptr++) return FALSE;
}
break;
/* Match a negated single character repeatedly. This is almost a repeat of
the code for a repeated single character, but I haven't found a nice way of
commoning these up that doesn't require a test of the positive/negative
option for each character match. Maybe that wouldn't add very much to the
time taken, but character matching *is* what this is all about... */
case OP_NOTEXACT:
min = max = (ecode[1] << 8) + ecode[2];
ecode += 3;
goto REPEATNOTCHAR;
case OP_NOTUPTO:
case OP_NOTMINUPTO:
min = 0;
max = (ecode[1] << 8) + ecode[2];
minimize = *ecode == OP_NOTMINUPTO;
ecode += 3;
goto REPEATNOTCHAR;
case OP_NOTSTAR:
case OP_NOTMINSTAR:
case OP_NOTPLUS:
case OP_NOTMINPLUS:
case OP_NOTQUERY:
case OP_NOTMINQUERY:
c = *ecode++ - OP_NOTSTAR;
minimize = (c & 1) != 0;
min = rep_min[c]; /* Pick up values from tables; */
max = rep_max[c]; /* zero for max => infinity */
if (max == 0) max = INT_MAX;
/* Common code for all repeated single-character matches. We can give
up quickly if there are fewer than the minimum number of characters left in
the subject. */
REPEATNOTCHAR:
if (min > md->end_subject - eptr) return FALSE;
c = *ecode++;
/* The code is duplicated for the caseless and caseful cases, for speed,
since matching characters is likely to be quite common. First, ensure the
minimum number of matches are present. If min = max, continue at the same
level without recursing. Otherwise, if minimizing, keep trying the rest of
the expression and advancing one matching character if failing, up to the
maximum. Alternatively, if maximizing, find the maximum number of
characters and work backwards. */
DPRINTF(("negative matching %c{%d,%d} against subject %.*s\n", c, min, max,
max, eptr));
if ((ims & PCRE_CASELESS) != 0)
{
c = md->lcc[c];
for (i = 1; i <= min; i++)
if (c == md->lcc[*eptr++]) return FALSE;
if (min == max) continue;
if (minimize)
{
for (i = min;; i++)
{
if (match(eptr, ecode, offset_top, md, ims, eptrb, 0))
return TRUE;
if (i >= max || eptr >= md->end_subject ||
c == md->lcc[*eptr++])
return FALSE;
}
/* Control never gets here */
}
else
{
const uschar *pp = eptr;
for (i = min; i < max; i++)
{
if (eptr >= md->end_subject || c == md->lcc[*eptr]) break;
eptr++;
}
while (eptr >= pp)
if (match(eptr--, ecode, offset_top, md, ims, eptrb, 0))
return TRUE;
return FALSE;
}
/* Control never gets here */
}
/* Caseful comparisons */
else
{
for (i = 1; i <= min; i++) if (c == *eptr++) return FALSE;
if (min == max) continue;
if (minimize)
{
for (i = min;; i++)
{
if (match(eptr, ecode, offset_top, md, ims, eptrb, 0))
return TRUE;
if (i >= max || eptr >= md->end_subject || c == *eptr++) return FALSE;
}
/* Control never gets here */
}
else
{
const uschar *pp = eptr;
for (i = min; i < max; i++)
{
if (eptr >= md->end_subject || c == *eptr) break;
eptr++;
}
while (eptr >= pp)
if (match(eptr--, ecode, offset_top, md, ims, eptrb, 0))
return TRUE;
return FALSE;
}
}
/* Control never gets here */
/* Match a single character type repeatedly; several different opcodes
share code. This is very similar to the code for single characters, but we
repeat it in the interests of efficiency. */
case OP_TYPEEXACT:
min = max = (ecode[1] << 8) + ecode[2];
minimize = TRUE;
ecode += 3;
goto REPEATTYPE;
case OP_TYPEUPTO:
case OP_TYPEMINUPTO:
min = 0;
max = (ecode[1] << 8) + ecode[2];
minimize = *ecode == OP_TYPEMINUPTO;
ecode += 3;
goto REPEATTYPE;
case OP_TYPESTAR:
case OP_TYPEMINSTAR:
case OP_TYPEPLUS:
case OP_TYPEMINPLUS:
case OP_TYPEQUERY:
case OP_TYPEMINQUERY:
c = *ecode++ - OP_TYPESTAR;
minimize = (c & 1) != 0;
min = rep_min[c]; /* Pick up values from tables; */
max = rep_max[c]; /* zero for max => infinity */
if (max == 0) max = INT_MAX;
/* Common code for all repeated single character type matches */
REPEATTYPE:
ctype = *ecode++; /* Code for the character type */
/* First, ensure the minimum number of matches are present. Use inline
code for maximizing the speed, and do the type test once at the start
(i.e. keep it out of the loop). Also we can test that there are at least
the minimum number of bytes before we start, except when doing '.' in
UTF8 mode. Leave the test in in all cases; in the special case we have
to test after each character. */
if (min > md->end_subject - eptr) return FALSE;
if (min > 0) switch(ctype)
{
case OP_ANY:
#ifdef SUPPORT_UTF8
if (md->utf8)
{
for (i = 1; i <= min; i++)
{
if (eptr >= md->end_subject ||
(*eptr++ == '\n' && (ims & PCRE_DOTALL) == 0))
return FALSE;
while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
}
break;
}
#endif
/* Non-UTF8 can be faster */
if ((ims & PCRE_DOTALL) == 0)
{ for (i = 1; i <= min; i++) if (*eptr++ == '\n') return FALSE; }
else eptr += min;
break;
case OP_NOT_DIGIT:
for (i = 1; i <= min; i++)
if ((md->ctypes[*eptr++] & ctype_digit) != 0) return FALSE;
break;
case OP_DIGIT:
for (i = 1; i <= min; i++)
if ((md->ctypes[*eptr++] & ctype_digit) == 0) return FALSE;
break;
case OP_NOT_WHITESPACE:
for (i = 1; i <= min; i++)
if ((md->ctypes[*eptr++] & ctype_space) != 0) return FALSE;
break;
case OP_WHITESPACE:
for (i = 1; i <= min; i++)
if ((md->ctypes[*eptr++] & ctype_space) == 0) return FALSE;
break;
case OP_NOT_WORDCHAR:
for (i = 1; i <= min; i++)
if ((md->ctypes[*eptr++] & ctype_word) != 0)
return FALSE;
break;
case OP_WORDCHAR:
for (i = 1; i <= min; i++)
if ((md->ctypes[*eptr++] & ctype_word) == 0)
return FALSE;
break;
}
/* If min = max, continue at the same level without recursing */
if (min == max) continue;
/* If minimizing, we have to test the rest of the pattern before each
subsequent match. */
if (minimize)
{
for (i = min;; i++)
{
if (match(eptr, ecode, offset_top, md, ims, eptrb, 0)) return TRUE;
if (i >= max || eptr >= md->end_subject) return FALSE;
c = *eptr++;
switch(ctype)
{
case OP_ANY:
if ((ims & PCRE_DOTALL) == 0 && c == '\n') return FALSE;
#ifdef SUPPORT_UTF8
if (md->utf8)
while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
#endif
break;
case OP_NOT_DIGIT:
if ((md->ctypes[c] & ctype_digit) != 0) return FALSE;
break;
case OP_DIGIT:
if ((md->ctypes[c] & ctype_digit) == 0) return FALSE;
break;
case OP_NOT_WHITESPACE:
if ((md->ctypes[c] & ctype_space) != 0) return FALSE;
break;
case OP_WHITESPACE:
if ((md->ctypes[c] & ctype_space) == 0) return FALSE;
break;
case OP_NOT_WORDCHAR:
if ((md->ctypes[c] & ctype_word) != 0) return FALSE;
break;
case OP_WORDCHAR:
if ((md->ctypes[c] & ctype_word) == 0) return FALSE;
break;
}
}
/* Control never gets here */
}
/* If maximizing it is worth using inline code for speed, doing the type
test once at the start (i.e. keep it out of the loop). */
else
{
const uschar *pp = eptr;
switch(ctype)
{
case OP_ANY:
/* Special code is required for UTF8, but when the maximum is unlimited
we don't need it. */
#ifdef SUPPORT_UTF8
if (md->utf8 && max < INT_MAX)
{
if ((ims & PCRE_DOTALL) == 0)
{
for (i = min; i < max; i++)
{
if (eptr >= md->end_subject || *eptr++ == '\n') break;
while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
}
}
else
{
for (i = min; i < max; i++)
{
eptr++;
while (eptr < md->end_subject && (*eptr & 0xc0) == 0x80) eptr++;
}
}
break;
}
#endif
/* Non-UTF8 can be faster */
if ((ims & PCRE_DOTALL) == 0)
{
for (i = min; i < max; i++)
{
if (eptr >= md->end_subject || *eptr == '\n') break;
eptr++;
}
}
else
{
c = max - min;
if (c > md->end_subject - eptr) c = md->end_subject - eptr;
eptr += c;
}
break;
case OP_NOT_DIGIT:
for (i = min; i < max; i++)
{
if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) != 0)
break;
eptr++;
}
break;
case OP_DIGIT:
for (i = min; i < max; i++)
{
if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_digit) == 0)
break;
eptr++;
}
break;
case OP_NOT_WHITESPACE:
for (i = min; i < max; i++)
{
if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) != 0)
break;
eptr++;
}
break;
case OP_WHITESPACE:
for (i = min; i < max; i++)
{
if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_space) == 0)
break;
eptr++;
}
break;
case OP_NOT_WORDCHAR:
for (i = min; i < max; i++)
{
if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) != 0)
break;
eptr++;
}
break;
case OP_WORDCHAR:
for (i = min; i < max; i++)
{
if (eptr >= md->end_subject || (md->ctypes[*eptr] & ctype_word) == 0)
break;
eptr++;
}
break;
}
while (eptr >= pp)
{
if (match(eptr--, ecode, offset_top, md, ims, eptrb, 0))
return TRUE;
#ifdef SUPPORT_UTF8
if (md->utf8)
while (eptr > pp && (*eptr & 0xc0) == 0x80) eptr--;
#endif
}
return FALSE;
}
/* Control never gets here */
/* There's been some horrible disaster. */
default:
DPRINTF(("Unknown opcode %d\n", *ecode));
md->errorcode = PCRE_ERROR_UNKNOWN_NODE;
return FALSE;
}
/* Do not stick any code in here without much thought; it is assumed
that "continue" in the code above comes out to here to repeat the main
loop. */
} /* End of main loop */
/* Control never reaches here */
}
/*************************************************
* Execute a Regular Expression *
*************************************************/
/* This function applies a compiled re to a subject string and picks out
portions of the string if it matches. Two elements in the vector are set for
each substring: the offsets to the start and end of the substring.
Arguments:
external_re points to the compiled expression
external_extra points to "hints" from pcre_study() or is NULL
subject points to the subject string
length length of subject string (may contain binary zeros)
start_offset where to start in the subject string
options option bits
offsets points to a vector of ints to be filled in with offsets
offsetcount the number of elements in the vector
Returns: > 0 => success; value is the number of elements filled in
= 0 => success, but offsets is not big enough
-1 => failed to match
< -1 => some kind of unexpected problem
*/
int
pcre_exec(const pcre *external_re, const pcre_extra *external_extra,
const char *subject, int length, int start_offset, int options, int *offsets,
int offsetcount)
{
int resetcount, ocount;
int first_char = -1;
int req_char = -1;
int req_char2 = -1;
unsigned long int ims = 0;
match_data match_block;
const uschar *start_bits = NULL;
const uschar *start_match = (const uschar *)subject + start_offset;
const uschar *end_subject;
const uschar *req_char_ptr = start_match - 1;
const real_pcre *re = (const real_pcre *)external_re;
const real_pcre_extra *extra = (const real_pcre_extra *)external_extra;
BOOL using_temporary_offsets = FALSE;
BOOL anchored = ((re->options | options) & PCRE_ANCHORED) != 0;
BOOL startline = (re->options & PCRE_STARTLINE) != 0;
if ((options & ~PUBLIC_EXEC_OPTIONS) != 0) return PCRE_ERROR_BADOPTION;
if (re == NULL || subject == NULL ||
(offsets == NULL && offsetcount > 0)) return PCRE_ERROR_NULL;
if (re->magic_number != MAGIC_NUMBER) return PCRE_ERROR_BADMAGIC;
match_block.start_pattern = re->code;
match_block.start_subject = (const uschar *)subject;
match_block.end_subject = match_block.start_subject + length;
end_subject = match_block.end_subject;
match_block.endonly = (re->options & PCRE_DOLLAR_ENDONLY) != 0;
match_block.utf8 = (re->options & PCRE_UTF8) != 0;
match_block.notbol = (options & PCRE_NOTBOL) != 0;
match_block.noteol = (options & PCRE_NOTEOL) != 0;
match_block.notempty = (options & PCRE_NOTEMPTY) != 0;
match_block.errorcode = PCRE_ERROR_NOMATCH; /* Default error */
match_block.lcc = re->tables + lcc_offset;
match_block.ctypes = re->tables + ctypes_offset;
/* The ims options can vary during the matching as a result of the presence
of (?ims) items in the pattern. They are kept in a local variable so that
restoring at the exit of a group is easy. */
ims = re->options & (PCRE_CASELESS|PCRE_MULTILINE|PCRE_DOTALL);
/* If the expression has got more back references than the offsets supplied can
hold, we get a temporary bit of working store to use during the matching.
Otherwise, we can use the vector supplied, rounding down its size to a multiple
of 3. */
ocount = offsetcount - (offsetcount % 3);
if (re->top_backref > 0 && re->top_backref >= ocount/3)
{
ocount = re->top_backref * 3 + 3;
match_block.offset_vector = (int *)(pcre_malloc)(ocount * sizeof(int));
if (match_block.offset_vector == NULL) return PCRE_ERROR_NOMEMORY;
using_temporary_offsets = TRUE;
DPRINTF(("Got memory to hold back references\n"));
}
else match_block.offset_vector = offsets;
match_block.offset_end = ocount;
match_block.offset_max = (2*ocount)/3;
match_block.offset_overflow = FALSE;
/* Compute the minimum number of offsets that we need to reset each time. Doing
this makes a huge difference to execution time when there aren't many brackets
in the pattern. */
resetcount = 2 + re->top_bracket * 2;
if (resetcount > offsetcount) resetcount = ocount;
/* Reset the working variable associated with each extraction. These should
never be used unless previously set, but they get saved and restored, and so we
initialize them to avoid reading uninitialized locations. */
if (match_block.offset_vector != NULL)
{
register int *iptr = match_block.offset_vector + ocount;
register int *iend = iptr - resetcount/2 + 1;
while (--iptr >= iend) *iptr = -1;
}
/* Set up the first character to match, if available. The first_char value is
never set for an anchored regular expression, but the anchoring may be forced
at run time, so we have to test for anchoring. The first char may be unset for
an unanchored pattern, of course. If there's no first char and the pattern was
studied, there may be a bitmap of possible first characters. */
if (!anchored)
{
if ((re->options & PCRE_FIRSTSET) != 0)
{
first_char = re->first_char;
if ((ims & PCRE_CASELESS) != 0) first_char = match_block.lcc[first_char];
}
else
if (!startline && extra != NULL &&
(extra->options & PCRE_STUDY_MAPPED) != 0)
start_bits = extra->start_bits;
}
/* For anchored or unanchored matches, there may be a "last known required
character" set. If the PCRE_CASELESS is set, implying that the match starts
caselessly, or if there are any changes of this flag within the regex, set up
both cases of the character. Otherwise set the two values the same, which will
avoid duplicate testing (which takes significant time). This covers the vast
majority of cases. It will be suboptimal when the case flag changes in a regex
and the required character in fact is caseful. */
if ((re->options & PCRE_REQCHSET) != 0)
{
req_char = re->req_char;
req_char2 = ((re->options & (PCRE_CASELESS | PCRE_ICHANGED)) != 0)?
(re->tables + fcc_offset)[req_char] : req_char;
}
/* Loop for handling unanchored repeated matching attempts; for anchored regexs
the loop runs just once. */
do
{
int rc;
register int *iptr = match_block.offset_vector;
register int *iend = iptr + resetcount;
/* Reset the maximum number of extractions we might see. */
while (iptr < iend) *iptr++ = -1;
/* Advance to a unique first char if possible */
if (first_char >= 0)
{
if ((ims & PCRE_CASELESS) != 0)
while (start_match < end_subject &&
match_block.lcc[*start_match] != first_char)
start_match++;
else
while (start_match < end_subject && *start_match != first_char)
start_match++;
}
/* Or to just after \n for a multiline match if possible */
else if (startline)
{
if (start_match > match_block.start_subject + start_offset)
{
while (start_match < end_subject && start_match[-1] != '\n')
start_match++;
}
}
/* Or to a non-unique first char after study */
else if (start_bits != NULL)
{
while (start_match < end_subject)
{
register int c = *start_match;
if ((start_bits[c/8] & (1 << (c&7))) == 0) start_match++; else break;
}
}
#ifdef DEBUG /* Sigh. Some compilers never learn. */
printf(">>>> Match against: ");
pchars(start_match, end_subject - start_match, TRUE, &match_block);
printf("\n");
#endif
/* If req_char is set, we know that that character must appear in the subject
for the match to succeed. If the first character is set, req_char must be
later in the subject; otherwise the test starts at the match point. This
optimization can save a huge amount of backtracking in patterns with nested
unlimited repeats that aren't going to match. We don't know what the state of
case matching may be when this character is hit, so test for it in both its
cases if necessary. However, the different cased versions will not be set up
unless PCRE_CASELESS was given or the casing state changes within the regex.
Writing separate code makes it go faster, as does using an autoincrement and
backing off on a match. */
if (req_char >= 0)
{
register const uschar *p = start_match + ((first_char >= 0)? 1 : 0);
/* We don't need to repeat the search if we haven't yet reached the
place we found it at last time. */
if (p > req_char_ptr)
{
/* Do a single test if no case difference is set up */
if (req_char == req_char2)
{
while (p < end_subject)
{
if (*p++ == req_char) { p--; break; }
}
}
/* Otherwise test for either case */
else
{
while (p < end_subject)
{
register int pp = *p++;
if (pp == req_char || pp == req_char2) { p--; break; }
}
}
/* If we can't find the required character, break the matching loop */
if (p >= end_subject) break;
/* If we have found the required character, save the point where we
found it, so that we don't search again next time round the loop if
the start hasn't passed this character yet. */
req_char_ptr = p;
}
}
/* When a match occurs, substrings will be set for all internal extractions;
we just need to set up the whole thing as substring 0 before returning. If
there were too many extractions, set the return code to zero. In the case
where we had to get some local store to hold offsets for backreferences, copy
those back references that we can. In this case there need not be overflow
if certain parts of the pattern were not used. */
match_block.start_match = start_match;
if (!match(start_match, re->code, 2, &match_block, ims, NULL, match_isgroup))
continue;
/* Copy the offset information from temporary store if necessary */
if (using_temporary_offsets)
{
if (offsetcount >= 4)
{
memcpy(offsets + 2, match_block.offset_vector + 2,
(offsetcount - 2) * sizeof(int));
DPRINTF(("Copied offsets from temporary memory\n"));
}
if (match_block.end_offset_top > offsetcount)
match_block.offset_overflow = TRUE;
DPRINTF(("Freeing temporary memory\n"));
(pcre_free)(match_block.offset_vector);
}
rc = match_block.offset_overflow? 0 : match_block.end_offset_top/2;
if (match_block.offset_end < 2) rc = 0; else
{
offsets[0] = start_match - match_block.start_subject;
offsets[1] = match_block.end_match_ptr - match_block.start_subject;
}
DPRINTF((">>>> returning %d\n", rc));
return rc;
}
/* This "while" is the end of the "do" above */
while (!anchored &&
match_block.errorcode == PCRE_ERROR_NOMATCH &&
start_match++ < end_subject);
if (using_temporary_offsets)
{
DPRINTF(("Freeing temporary memory\n"));
(pcre_free)(match_block.offset_vector);
}
DPRINTF((">>>> returning %d\n", match_block.errorcode));
return match_block.errorcode;
}
/* End of pcre.c */
privoxy-3.0.21-stable/./pcre/pcreposix.h 000640 001751 001751 00000004445 10546014100 017076 0 ustar 00fk fk 000000 000000 /*************************************************
* Perl-Compatible Regular Expressions *
*************************************************/
/* Copyright (c) 1997-2000 University of Cambridge */
#ifndef _PCREPOSIX_H
#define _PCREPOSIX_H
/* This is the header for the POSIX wrapper interface to the PCRE Perl-
Compatible Regular Expression library. It defines the things POSIX says should
be there. I hope. */
/* Have to include stdlib.h in order to ensure that size_t is defined. */
#include
/* Allow for C++ users */
#ifdef __cplusplus
extern "C" {
#endif
/* Options defined by POSIX. */
#define REG_ICASE 0x01
#define REG_NEWLINE 0x02
#define REG_NOTBOL 0x04
#define REG_NOTEOL 0x08
/* These are not used by PCRE, but by defining them we make it easier
to slot PCRE into existing programs that make POSIX calls. */
#define REG_EXTENDED 0
#define REG_NOSUB 0
/* Error values. Not all of these are relevant to, or used by, the wrapper.
The numbering starts at 1 and is consecutive; each value is written out so
that the number behind a given code can be read off directly. */
enum {
  REG_ASSERT   = 1,   /* internal error ? */
  REG_BADBR    = 2,   /* invalid repeat counts in {} */
  REG_BADPAT   = 3,   /* pattern error */
  REG_BADRPT   = 4,   /* ? * + invalid */
  REG_EBRACE   = 5,   /* unbalanced {} */
  REG_EBRACK   = 6,   /* unbalanced [] */
  REG_ECOLLATE = 7,   /* collation error - not relevant */
  REG_ECTYPE   = 8,   /* bad class */
  REG_EESCAPE  = 9,   /* bad escape sequence */
  REG_EMPTY    = 10,  /* empty expression */
  REG_EPAREN   = 11,  /* unbalanced () */
  REG_ERANGE   = 12,  /* bad range inside [] */
  REG_ESIZE    = 13,  /* expression too big */
  REG_ESPACE   = 14,  /* failed to get memory */
  REG_ESUBREG  = 15,  /* bad back reference */
  REG_INVARG   = 16,  /* bad argument */
  REG_NOMATCH  = 17   /* match failed */
};
/* The structure representing a compiled regular expression. It is the object
passed to regcomp(), regexec(), regerror() and regfree() declared below. The
field meanings noted here are inferred from the names and from the usual POSIX
regex_t layout; this header does not itself show how they are filled in. */
typedef struct {
void *re_pcre;          /* NOTE(review): presumably the compiled PCRE pattern - confirm in pcreposix.c */
size_t re_nsub;         /* NOTE(review): presumably the number of capturing subpatterns - confirm */
size_t re_erroffset;    /* NOTE(review): presumably the pattern offset of a compile error - confirm */
} regex_t;
/* The structure in which a captured offset is returned. regexec() takes a
pointer to regmatch_t. Offsets are plain ints (regoff_t). */
typedef int regoff_t;
typedef struct {
regoff_t rm_so;         /* NOTE(review): presumably the start offset of the substring, as in POSIX - confirm */
regoff_t rm_eo;         /* NOTE(review): presumably the end offset of the substring, as in POSIX - confirm */
} regmatch_t;
/* The functions */
extern int regcomp(regex_t *, const char *, int);
extern int regexec(regex_t *, const char *, size_t, regmatch_t *, int);
extern size_t regerror(int, const regex_t *, char *, size_t);
extern void regfree(regex_t *);
#ifdef __cplusplus
} /* extern "C" */
#endif
#endif /* End of pcreposix.h */
privoxy-3.0.21-stable/./pcre/get.c 000640 001751 001751 00000016401 10546014100 015627 0 ustar 00fk fk 000000 000000 /*************************************************
* Perl-Compatible Regular Expressions *
*************************************************/
/*
This is a library of functions to support regular expressions whose syntax
and semantics are as close as possible to those of the Perl 5 language. See
the file Tech.Notes for some information on the internals.
Written by: Philip Hazel
Copyright (c) 1997-2000 University of Cambridge
-----------------------------------------------------------------------------
Permission is granted to anyone to use this software for any purpose on any
computer system, and to redistribute it freely, subject to the following
restrictions:
1. This software is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
2. The origin of this software must not be misrepresented, either by
explicit claim or by omission.
3. Altered versions must be plainly marked as such, and must not be
misrepresented as being the original software.
4. If PCRE is embedded in any software that is released under the GNU
General Purpose Licence (GPL), then the terms of that licence shall
supersede any condition above with which it is incompatible.
-----------------------------------------------------------------------------
*/
/* This module contains some convenience functions for extracting substrings
from the subject string after a regex match has succeeded. The original idea
for these functions came from Scott Wimer. */
/* Include the internals header, which itself includes Standard C headers plus
the external pcre header. */
#include "internal.h"
/*************************************************
*      Copy captured string to given buffer      *
*************************************************/

/* Extract one captured substring from a successful match and place it, with a
terminating zero, in a buffer supplied by the caller. The bytes are moved with
memcpy() rather than strncpy() so that binary zeros inside the substring are
copied like any other byte.

Arguments:
  subject        the subject string that was matched
  ovector        pointer to the offsets table
  stringcount    the number of substrings that were captured
                   (i.e. the yield of the pcre_exec call, unless
                   that was zero, in which case it should be 1/3
                   of the offset table size)
  stringnumber   the number of the required substring
  buffer         where to put the substring
  size           the size of the buffer

Returns:         if successful:
                   the length of the copied string, not including the zero
                   that is put on the end; can be zero
                 if not successful:
                   PCRE_ERROR_NOMEMORY (-6) buffer too small
                   PCRE_ERROR_NOSUBSTRING (-7) no such captured substring
*/

int
pcre_copy_substring(const char *subject, int *ovector, int stringcount,
  int stringnumber, char *buffer, int size)
{
int length;
int *pair;

/* Only substrings 0 .. stringcount-1 exist. */
if (stringnumber >= stringcount || stringnumber < 0)
  return PCRE_ERROR_NOSUBSTRING;

/* Each substring owns two consecutive slots: start offset, then end offset. */
pair = ovector + 2 * stringnumber;
length = pair[1] - pair[0];

/* The buffer must hold the text plus the terminating zero. */
if (length + 1 > size) return PCRE_ERROR_NOMEMORY;

memcpy(buffer, subject + pair[0], length);
buffer[length] = 0;
return length;
}
/*************************************************
*      Copy all captured strings to new store    *
*************************************************/

/* Build, in a single chunk of store obtained from pcre_malloc, a list of
pointers to copies of every captured substring. The chunk starts with
stringcount pointers plus a terminating NULL pointer; the zero-terminated
copies of the substrings follow immediately after that pointer table.

Arguments:
  subject        the subject string that was matched
  ovector        pointer to the offsets table
  stringcount    the number of substrings that were captured
                   (i.e. the yield of the pcre_exec call, unless
                   that was zero, in which case it should be 1/3
                   of the offset table size)
  listptr        set to point to the list of pointers

Returns:         if successful: 0
                 if not successful:
                   PCRE_ERROR_NOMEMORY (-6) failed to get store
*/

int
pcre_get_substring_list(const char *subject, int *ovector, int stringcount,
  const char ***listptr)
{
int n;
int total = sizeof(char *);     /* the terminating NULL pointer */
char **slots;
char *text;

/* First pass: one pointer, the text and a zero byte for every substring. */
for (n = 0; n < stringcount; n++)
  total += sizeof(char *) + ovector[2*n+1] - ovector[2*n] + 1;

slots = (char **)(pcre_malloc)(total);
if (slots == NULL) return PCRE_ERROR_NOMEMORY;

*listptr = (const char **)slots;

/* String data begins right behind the pointer table and its NULL. */
text = (char *)(slots + stringcount + 1);

/* Second pass: copy each substring and record where it was put. */
for (n = 0; n < stringcount; n++)
  {
  int start = ovector[2*n];
  int len = ovector[2*n+1] - start;
  slots[n] = text;
  memcpy(text, subject + start, len);
  text[len] = 0;
  text += len + 1;
  }

/* n now indexes the slot after the last pointer written. */
slots[n] = NULL;
return 0;
}
/*************************************************
*   Free store obtained by get_substring_list    *
*************************************************/

/* Release a list produced by pcre_get_substring_list(). It is provided for
callers written in languages other than C, which can call PCRE's functions but
cannot call free() or (pcre_free)() directly.

Argument:   the result of a previous pcre_get_substring_list()
Returns:    nothing
*/

void
pcre_free_substring_list(const char **pointer)
{
/* Drop the const qualification before handing the store back. */
void *store = (void *)pointer;
(pcre_free)(store);
}
/*************************************************
*      Copy captured string to new store         *
*************************************************/

/* Make a zero-terminated copy of one captured substring in store freshly
obtained from pcre_malloc, and hand a pointer to that copy back to the caller.

Arguments:
  subject        the subject string that was matched
  ovector        pointer to the offsets table
  stringcount    the number of substrings that were captured
                   (i.e. the yield of the pcre_exec call, unless
                   that was zero, in which case it should be 1/3
                   of the offset table size)
  stringnumber   the number of the required substring
  stringptr      where to put a pointer to the substring

Returns:         if successful:
                   the length of the string, not including the zero that
                   is put on the end; can be zero
                 if not successful:
                   PCRE_ERROR_NOMEMORY (-6) failed to get store
                   PCRE_ERROR_NOSUBSTRING (-7) substring not present
*/

int
pcre_get_substring(const char *subject, int *ovector, int stringcount,
  int stringnumber, const char **stringptr)
{
int start;
int length;
char *copy;

/* Only substrings 0 .. stringcount-1 exist. */
if (stringnumber >= stringcount || stringnumber < 0)
  return PCRE_ERROR_NOSUBSTRING;

/* Each substring owns two consecutive slots: start offset, then end offset. */
start = ovector[2*stringnumber];
length = ovector[2*stringnumber + 1] - start;

/* One extra byte for the terminating zero. */
copy = (char *)(pcre_malloc)(length + 1);
if (copy == NULL) return PCRE_ERROR_NOMEMORY;

memcpy(copy, subject + start, length);
copy[length] = 0;
*stringptr = copy;
return length;
}
/*************************************************
*      Free store obtained by get_substring      *
*************************************************/

/* Release a string produced by pcre_get_substring(). It is provided for
callers written in languages other than C, which can call PCRE's functions but
cannot call free() or (pcre_free)() directly.

Argument:   the result of a previous pcre_get_substring()
Returns:    nothing
*/

void
pcre_free_substring(const char *pointer)
{
/* Drop the const qualification before handing the store back. */
void *store = (void *)pointer;
(pcre_free)(store);
}
/* End of get.c */
privoxy-3.0.21-stable/./pcre/dll.mk 000640 001751 001751 00000003667 10546014100 016022 0 ustar 00fk fk 000000 000000 # dll.mk - auxilary Makefile to easy build dll's for mingw32 target
# ver. 0.6 of 1999-03-25
#
# Homepage of this makefile - http://www.is.lg.ua/~paul/devel/
# Homepage of original mingw32 project -
# http://www.fu.is.saga-u.ac.jp/~colin/gcc.html
#
# How to use:
# This makefile can:
# 1. Automatically create a .def file from a list of objects
# 2. Create a .dll from objects and a .def file, either the automatically
# generated one or your own hand-written file, which must have the same
# basename as the dll
# WARNING! There MUST be an object whose name matches the dll's name. Make sux.
# 3. Create an import library from a .def (as for the .dll, only its name is
# required, not the dll itself)
# By convention implibs for a dll have the .dll.a suffix, e.g. libstuff.dll.a
# Why not just libstuff.a? 'Cos that's the name for a static lib, ok?
# The process is divided into 3 phases because:
# 1. A pre-existing .def is possible
# 2. Generating an implib is fairly time-consuming
#
# Variables:
# DLL_LDLIBS - libs for linking dll
# DLL_LDFLAGS - flags for linking dll
#
# By using $(DLL_SUFFIX) instead of 'dll', e.g. stuff.$(DLL_SUFFIX)
# you may help porting makefiles to other platforms
#
# Put this file in your make's include path (e.g. main include dir; for
# more information see the include section in the make doc). Put the line
# "include dll.mk" at the beginning of your own Makefile. Specify dependencies, e.g.:
#
# Do all stuff in one step
# libstuff.dll.a: $(OBJECTS) stuff.def
# stuff.def: $(OBJECTS)
#
# Steps separated, pre-provided .def, link with user32
#
# DLL_LDLIBS=-luser32
# stuff.dll: $(OBJECTS)
# libstuff.dll.a: $(OBJECTS)
# Tool names and the shared-library suffix used by the rules below.
DLLWRAP=dllwrap
DLLTOOL=dlltool
DLL_SUFFIX=dll
# Register the object and dll suffixes with make.
.SUFFIXES: .o .$(DLL_SUFFIX)
# Generate a .def file from the prerequisite objects, exporting all symbols
# (dlltool --export-all --output-def).
_%.def: %.o
$(DLLTOOL) --export-all --output-def $@ $^
# Link a dll with dllwrap from the .o prerequisites, using $*.def as the
# definition file and $(CC) as the driver; DLL_LDFLAGS and DLL_LDLIBS are
# appended to the link command.
%.$(DLL_SUFFIX): %.o
$(DLLWRAP) --dllname $(notdir $@) --driver-name $(CC) --def $*.def -o $@ $(filter %.o,$^) $(DLL_LDFLAGS) $(DLL_LDLIBS)
# Build the import library lib<name>.$(DLL_SUFFIX).a from <name>.def; only the
# dll's name is passed to dlltool, not the dll file itself.
lib%.$(DLL_SUFFIX).a:%.def
$(DLLTOOL) --dllname $(notdir $*.dll) --def $< --output-lib $@
# End
privoxy-3.0.21-stable/./pcre/configure 000640 001751 001751 00000134707 10546014100 016622 0 ustar 00fk fk 000000 000000 #! /bin/sh
# Guess values for system-dependent variables and create Makefiles.
# Generated automatically using autoconf version 2.13
# Copyright (C) 1992, 93, 94, 95, 96 Free Software Foundation, Inc.
#
# This configure script is free software; the Free Software Foundation
# gives unlimited permission to copy, distribute and modify it.
# Defaults:
ac_help=
ac_default_prefix=/usr/local
# Any additions from configure.in:
ac_help="$ac_help
--disable-shared build PCRE as a static library"
ac_help="$ac_help
--enable-utf8 enable UTF8 support (incomplete)"
# Initialize some variables set by options.
# The variables have the same names as the options, with
# dashes changed to underlines.
build=NONE
cache_file=./config.cache
exec_prefix=NONE
host=NONE
no_create=
nonopt=NONE
no_recursion=
prefix=NONE
program_prefix=NONE
program_suffix=NONE
program_transform_name=s,x,x,
silent=
site=
srcdir=
target=NONE
verbose=
x_includes=NONE
x_libraries=NONE
bindir='${exec_prefix}/bin'
sbindir='${exec_prefix}/sbin'
libexecdir='${exec_prefix}/libexec'
datadir='${prefix}/share'
sysconfdir='${prefix}/etc'
sharedstatedir='${prefix}/com'
localstatedir='${prefix}/var'
libdir='${exec_prefix}/lib'
includedir='${prefix}/include'
oldincludedir='/usr/include'
infodir='${prefix}/info'
mandir='${prefix}/man'
# Initialize some other variables.
subdirs=
MFLAGS= MAKEFLAGS=
SHELL=${CONFIG_SHELL-/bin/sh}
# Maximum number of lines to put in a shell here document.
ac_max_here_lines=12
ac_prev=
for ac_option
do
# If the previous option needs an argument, assign it.
if test -n "$ac_prev"; then
eval "$ac_prev=\$ac_option"
ac_prev=
continue
fi
case "$ac_option" in
-*=*) ac_optarg=`echo "$ac_option" | sed 's/[-_a-zA-Z0-9]*=//'` ;;
*) ac_optarg= ;;
esac
# Accept the important Cygnus configure options, so we can diagnose typos.
case "$ac_option" in
-bindir | --bindir | --bindi | --bind | --bin | --bi)
ac_prev=bindir ;;
-bindir=* | --bindir=* | --bindi=* | --bind=* | --bin=* | --bi=*)
bindir="$ac_optarg" ;;
-build | --build | --buil | --bui | --bu)
ac_prev=build ;;
-build=* | --build=* | --buil=* | --bui=* | --bu=*)
build="$ac_optarg" ;;
-cache-file | --cache-file | --cache-fil | --cache-fi \
| --cache-f | --cache- | --cache | --cach | --cac | --ca | --c)
ac_prev=cache_file ;;
-cache-file=* | --cache-file=* | --cache-fil=* | --cache-fi=* \
| --cache-f=* | --cache-=* | --cache=* | --cach=* | --cac=* | --ca=* | --c=*)
cache_file="$ac_optarg" ;;
-datadir | --datadir | --datadi | --datad | --data | --dat | --da)
ac_prev=datadir ;;
-datadir=* | --datadir=* | --datadi=* | --datad=* | --data=* | --dat=* \
| --da=*)
datadir="$ac_optarg" ;;
-disable-* | --disable-*)
ac_feature=`echo $ac_option|sed -e 's/-*disable-//'`
# Reject names that are not valid shell variable names.
if test -n "`echo $ac_feature| sed 's/[-a-zA-Z0-9_]//g'`"; then
{ echo "configure: error: $ac_feature: invalid feature name" 1>&2; exit 1; }
fi
ac_feature=`echo $ac_feature| sed 's/-/_/g'`
eval "enable_${ac_feature}=no" ;;
-enable-* | --enable-*)
ac_feature=`echo $ac_option|sed -e 's/-*enable-//' -e 's/=.*//'`
# Reject names that are not valid shell variable names.
if test -n "`echo $ac_feature| sed 's/[-_a-zA-Z0-9]//g'`"; then
{ echo "configure: error: $ac_feature: invalid feature name" 1>&2; exit 1; }
fi
ac_feature=`echo $ac_feature| sed 's/-/_/g'`
case "$ac_option" in
*=*) ;;
*) ac_optarg=yes ;;
esac
eval "enable_${ac_feature}='$ac_optarg'" ;;
-exec-prefix | --exec_prefix | --exec-prefix | --exec-prefi \
| --exec-pref | --exec-pre | --exec-pr | --exec-p | --exec- \
| --exec | --exe | --ex)
ac_prev=exec_prefix ;;
-exec-prefix=* | --exec_prefix=* | --exec-prefix=* | --exec-prefi=* \
| --exec-pref=* | --exec-pre=* | --exec-pr=* | --exec-p=* | --exec-=* \
| --exec=* | --exe=* | --ex=*)
exec_prefix="$ac_optarg" ;;
-gas | --gas | --ga | --g)
# Obsolete; use --with-gas.
with_gas=yes ;;
-help | --help | --hel | --he)
# Omit some internal or obsolete options to make the list less imposing.
# The list generated by autoconf has been trimmed to remove many
# options that are totally irrelevant to PCRE (e.g. relating to X),
# or are not supported by its Makefile.
# This message is too long to be a string in the A/UX 3.1 sh.
cat << EOF
Usage: ./configure [options]
Options: [defaults in brackets after descriptions]
Configuration:
--cache-file=FILE cache test results in FILE
--help print this message
--no-create do not create output files
--quiet, --silent do not print \`checking...' messages
--version print the version of autoconf that created configure
Directory and file names:
--prefix=PREFIX install architecture-independent files in PREFIX
[$ac_default_prefix]
--exec-prefix=EPREFIX install architecture-dependent files in EPREFIX
[same as prefix]
--bindir=DIR user executables in DIR [EPREFIX/bin]
--libdir=DIR object code libraries in DIR [EPREFIX/lib]
--includedir=DIR C header files in DIR [PREFIX/include]
--mandir=DIR man documentation in DIR [PREFIX/man]
EOF
cat << EOF
EOF
if test -n "$ac_help"; then
echo "--enable and --with options recognized:$ac_help"
fi
exit 0 ;;
-host | --host | --hos | --ho)
ac_prev=host ;;
-host=* | --host=* | --hos=* | --ho=*)
host="$ac_optarg" ;;
-includedir | --includedir | --includedi | --included | --include \
| --includ | --inclu | --incl | --inc)
ac_prev=includedir ;;
-includedir=* | --includedir=* | --includedi=* | --included=* | --include=* \
| --includ=* | --inclu=* | --incl=* | --inc=*)
includedir="$ac_optarg" ;;
-infodir | --infodir | --infodi | --infod | --info | --inf)
ac_prev=infodir ;;
-infodir=* | --infodir=* | --infodi=* | --infod=* | --info=* | --inf=*)
infodir="$ac_optarg" ;;
-libdir | --libdir | --libdi | --libd)
ac_prev=libdir ;;
-libdir=* | --libdir=* | --libdi=* | --libd=*)
libdir="$ac_optarg" ;;
-libexecdir | --libexecdir | --libexecdi | --libexecd | --libexec \
| --libexe | --libex | --libe)
ac_prev=libexecdir ;;
-libexecdir=* | --libexecdir=* | --libexecdi=* | --libexecd=* | --libexec=* \
| --libexe=* | --libex=* | --libe=*)
libexecdir="$ac_optarg" ;;
-localstatedir | --localstatedir | --localstatedi | --localstated \
| --localstate | --localstat | --localsta | --localst \
| --locals | --local | --loca | --loc | --lo)
ac_prev=localstatedir ;;
-localstatedir=* | --localstatedir=* | --localstatedi=* | --localstated=* \
| --localstate=* | --localstat=* | --localsta=* | --localst=* \
| --locals=* | --local=* | --loca=* | --loc=* | --lo=*)
localstatedir="$ac_optarg" ;;
-mandir | --mandir | --mandi | --mand | --man | --ma | --m)
ac_prev=mandir ;;
-mandir=* | --mandir=* | --mandi=* | --mand=* | --man=* | --ma=* | --m=*)
mandir="$ac_optarg" ;;
-nfp | --nfp | --nf)
# Obsolete; use --without-fp.
with_fp=no ;;
-no-create | --no-create | --no-creat | --no-crea | --no-cre \
| --no-cr | --no-c)
no_create=yes ;;
-no-recursion | --no-recursion | --no-recursio | --no-recursi \
| --no-recurs | --no-recur | --no-recu | --no-rec | --no-re | --no-r)
no_recursion=yes ;;
-oldincludedir | --oldincludedir | --oldincludedi | --oldincluded \
| --oldinclude | --oldinclud | --oldinclu | --oldincl | --oldinc \
| --oldin | --oldi | --old | --ol | --o)
ac_prev=oldincludedir ;;
-oldincludedir=* | --oldincludedir=* | --oldincludedi=* | --oldincluded=* \
| --oldinclude=* | --oldinclud=* | --oldinclu=* | --oldincl=* | --oldinc=* \
| --oldin=* | --oldi=* | --old=* | --ol=* | --o=*)
oldincludedir="$ac_optarg" ;;
-prefix | --prefix | --prefi | --pref | --pre | --pr | --p)
ac_prev=prefix ;;
-prefix=* | --prefix=* | --prefi=* | --pref=* | --pre=* | --pr=* | --p=*)
prefix="$ac_optarg" ;;
-program-prefix | --program-prefix | --program-prefi | --program-pref \
| --program-pre | --program-pr | --program-p)
ac_prev=program_prefix ;;
-program-prefix=* | --program-prefix=* | --program-prefi=* \
| --program-pref=* | --program-pre=* | --program-pr=* | --program-p=*)
program_prefix="$ac_optarg" ;;
-program-suffix | --program-suffix | --program-suffi | --program-suff \
| --program-suf | --program-su | --program-s)
ac_prev=program_suffix ;;
-program-suffix=* | --program-suffix=* | --program-suffi=* \
| --program-suff=* | --program-suf=* | --program-su=* | --program-s=*)
program_suffix="$ac_optarg" ;;
-program-transform-name | --program-transform-name \
| --program-transform-nam | --program-transform-na \
| --program-transform-n | --program-transform- \
| --program-transform | --program-transfor \
| --program-transfo | --program-transf \
| --program-trans | --program-tran \
| --progr-tra | --program-tr | --program-t)
ac_prev=program_transform_name ;;
-program-transform-name=* | --program-transform-name=* \
| --program-transform-nam=* | --program-transform-na=* \
| --program-transform-n=* | --program-transform-=* \
| --program-transform=* | --program-transfor=* \
| --program-transfo=* | --program-transf=* \
| --program-trans=* | --program-tran=* \
| --progr-tra=* | --program-tr=* | --program-t=*)
program_transform_name="$ac_optarg" ;;
-q | -quiet | --quiet | --quie | --qui | --qu | --q \
| -silent | --silent | --silen | --sile | --sil)
silent=yes ;;
-sbindir | --sbindir | --sbindi | --sbind | --sbin | --sbi | --sb)
ac_prev=sbindir ;;
-sbindir=* | --sbindir=* | --sbindi=* | --sbind=* | --sbin=* \
| --sbi=* | --sb=*)
sbindir="$ac_optarg" ;;
-sharedstatedir | --sharedstatedir | --sharedstatedi \
| --sharedstated | --sharedstate | --sharedstat | --sharedsta \
| --sharedst | --shareds | --shared | --share | --shar \
| --sha | --sh)
ac_prev=sharedstatedir ;;
-sharedstatedir=* | --sharedstatedir=* | --sharedstatedi=* \
| --sharedstated=* | --sharedstate=* | --sharedstat=* | --sharedsta=* \
| --sharedst=* | --shareds=* | --shared=* | --share=* | --shar=* \
| --sha=* | --sh=*)
sharedstatedir="$ac_optarg" ;;
-site | --site | --sit)
ac_prev=site ;;
-site=* | --site=* | --sit=*)
site="$ac_optarg" ;;
-srcdir | --srcdir | --srcdi | --srcd | --src | --sr)
ac_prev=srcdir ;;
-srcdir=* | --srcdir=* | --srcdi=* | --srcd=* | --src=* | --sr=*)
srcdir="$ac_optarg" ;;
-sysconfdir | --sysconfdir | --sysconfdi | --sysconfd | --sysconf \
| --syscon | --sysco | --sysc | --sys | --sy)
ac_prev=sysconfdir ;;
-sysconfdir=* | --sysconfdir=* | --sysconfdi=* | --sysconfd=* | --sysconf=* \
| --syscon=* | --sysco=* | --sysc=* | --sys=* | --sy=*)
sysconfdir="$ac_optarg" ;;
-target | --target | --targe | --targ | --tar | --ta | --t)
ac_prev=target ;;
-target=* | --target=* | --targe=* | --targ=* | --tar=* | --ta=* | --t=*)
target="$ac_optarg" ;;
-v | -verbose | --verbose | --verbos | --verbo | --verb)
verbose=yes ;;
-version | --version | --versio | --versi | --vers)
echo "configure generated by autoconf version 2.13"
exit 0 ;;
-with-* | --with-*)
ac_package=`echo $ac_option|sed -e 's/-*with-//' -e 's/=.*//'`
# Reject names that are not valid shell variable names.
if test -n "`echo $ac_package| sed 's/[-_a-zA-Z0-9]//g'`"; then
{ echo "configure: error: $ac_package: invalid package name" 1>&2; exit 1; }
fi
ac_package=`echo $ac_package| sed 's/-/_/g'`
case "$ac_option" in
*=*) ;;
*) ac_optarg=yes ;;
esac
eval "with_${ac_package}='$ac_optarg'" ;;
-without-* | --without-*)
ac_package=`echo $ac_option|sed -e 's/-*without-//'`
# Reject names that are not valid shell variable names.
if test -n "`echo $ac_package| sed 's/[-a-zA-Z0-9_]//g'`"; then
{ echo "configure: error: $ac_package: invalid package name" 1>&2; exit 1; }
fi
ac_package=`echo $ac_package| sed 's/-/_/g'`
eval "with_${ac_package}=no" ;;
--x)
# Obsolete; use --with-x.
with_x=yes ;;
-x-includes | --x-includes | --x-include | --x-includ | --x-inclu \
| --x-incl | --x-inc | --x-in | --x-i)
ac_prev=x_includes ;;
-x-includes=* | --x-includes=* | --x-include=* | --x-includ=* | --x-inclu=* \
| --x-incl=* | --x-inc=* | --x-in=* | --x-i=*)
x_includes="$ac_optarg" ;;
-x-libraries | --x-libraries | --x-librarie | --x-librari \
| --x-librar | --x-libra | --x-libr | --x-lib | --x-li | --x-l)
ac_prev=x_libraries ;;
-x-libraries=* | --x-libraries=* | --x-librarie=* | --x-librari=* \
| --x-librar=* | --x-libra=* | --x-libr=* | --x-lib=* | --x-li=* | --x-l=*)
x_libraries="$ac_optarg" ;;
-*) { echo "configure: error: $ac_option: invalid option; use --help to show usage" 1>&2; exit 1; }
;;
*)
if test -n "`echo $ac_option| sed 's/[-a-z0-9.]//g'`"; then
echo "configure: warning: $ac_option: invalid host type" 1>&2
fi
if test "x$nonopt" != xNONE; then
{ echo "configure: error: can only configure for one host and one target at a time" 1>&2; exit 1; }
fi
nonopt="$ac_option"
;;
esac
done
if test -n "$ac_prev"; then
{ echo "configure: error: missing argument to --`echo $ac_prev | sed 's/_/-/g'`" 1>&2; exit 1; }
fi
# On signals 1, 2 and 15, remove the conftest*/confdefs* scratch files, any
# core files and whatever is named in $ac_clean_files, then exit with status 1.
trap 'rm -fr conftest* confdefs* core core.* *.core $ac_clean_files; exit 1' 1 2 15
# File descriptor usage:
# 0 standard input
# 1 file creation
# 2 errors and warnings
# 3 some systems may open it to /dev/tty
# 4 used on the Kubota Titan
# 6 checking for... messages and results
# 5 compiler messages saved in config.log
if test "$silent" = yes; then
exec 6>/dev/null
else
exec 6>&1
fi
exec 5>./config.log
echo "\
This file contains any messages produced by compilers while
running configure, to aid debugging if configure makes a mistake.
" 1>&5
# Strip out --no-create and --no-recursion so they do not pile up.
# Also quote any args containing shell metacharacters.
ac_configure_args=
for ac_arg
do
case "$ac_arg" in
-no-create | --no-create | --no-creat | --no-crea | --no-cre \
| --no-cr | --no-c) ;;
-no-recursion | --no-recursion | --no-recursio | --no-recursi \
| --no-recurs | --no-recur | --no-recu | --no-rec | --no-re | --no-r) ;;
*" "*|*" "*|*[\[\]\~\#\$\^\&\*\(\)\{\}\\\|\;\<\>\?]*)
ac_configure_args="$ac_configure_args '$ac_arg'" ;;
*) ac_configure_args="$ac_configure_args $ac_arg" ;;
esac
done
# NLS nuisances.
# Only set these to C if already set. These must not be set unconditionally
# because not all systems understand e.g. LANG=C (notably SCO).
# Fixing LC_MESSAGES prevents Solaris sh from translating var values in `set'!
# Non-C LC_CTYPE values break the ctype check.
if test "${LANG+set}" = set; then LANG=C; export LANG; fi
if test "${LC_ALL+set}" = set; then LC_ALL=C; export LC_ALL; fi
if test "${LC_MESSAGES+set}" = set; then LC_MESSAGES=C; export LC_MESSAGES; fi
if test "${LC_CTYPE+set}" = set; then LC_CTYPE=C; export LC_CTYPE; fi
# confdefs.h avoids OS command line length limits that DEFS can exceed.
rm -rf conftest* confdefs.h
# AIX cpp loses on an empty file, so make sure it contains at least a newline.
echo > confdefs.h
# A filename unique to this package, relative to the directory that
# configure is in, which we can look for to find out if srcdir is correct.
ac_unique_file=dftables.c
# Find the source files, if location was not specified.
if test -z "$srcdir"; then
ac_srcdir_defaulted=yes
# Try the directory containing this script, then its parent.
ac_prog=$0
ac_confdir=`echo $ac_prog|sed 's%/[^/][^/]*$%%'`
test "x$ac_confdir" = "x$ac_prog" && ac_confdir=.
srcdir=$ac_confdir
if test ! -r $srcdir/$ac_unique_file; then
srcdir=..
fi
else
ac_srcdir_defaulted=no
fi
if test ! -r $srcdir/$ac_unique_file; then
if test "$ac_srcdir_defaulted" = yes; then
{ echo "configure: error: can not find sources in $ac_confdir or .." 1>&2; exit 1; }
else
{ echo "configure: error: can not find sources in $srcdir" 1>&2; exit 1; }
fi
fi
srcdir=`echo "${srcdir}" | sed 's%\([^/]\)/*$%\1%'`
# Prefer explicitly selected file to automatically selected ones.
if test -z "$CONFIG_SITE"; then
if test "x$prefix" != xNONE; then
CONFIG_SITE="$prefix/share/config.site $prefix/etc/config.site"
else
CONFIG_SITE="$ac_default_prefix/share/config.site $ac_default_prefix/etc/config.site"
fi
fi
for ac_site_file in $CONFIG_SITE; do
if test -r "$ac_site_file"; then
echo "loading site script $ac_site_file"
. "$ac_site_file"
fi
done
if test -r "$cache_file"; then
echo "loading cache $cache_file"
. $cache_file
else
echo "creating cache $cache_file"
> $cache_file
fi
ac_ext=c
# CFLAGS is not in ac_cpp because -g, -O, etc. are not valid cpp options.
ac_cpp='$CPP $CPPFLAGS'
ac_compile='${CC-cc} -c $CFLAGS $CPPFLAGS conftest.$ac_ext 1>&5'
ac_link='${CC-cc} -o conftest${ac_exeext} $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS 1>&5'
cross_compiling=$ac_cv_prog_cc_cross
ac_exeext=
ac_objext=o
if (echo "testing\c"; echo 1,2,3) | grep c >/dev/null; then
# Stardent Vistra SVR4 grep lacks -e, says ghazi@caip.rutgers.edu.
if (echo -n testing; echo 1,2,3) | sed s/-n/xn/ | grep xn >/dev/null; then
ac_n= ac_c='
' ac_t=' '
else
ac_n=-n ac_c= ac_t=
fi
else
ac_n= ac_c='\c' ac_t=
fi
# Version information for this PCRE release. PCRE_VERSION is assembled from
# the major and minor numbers.
PCRE_MAJOR=3
PCRE_MINOR=4
PCRE_DATE=22-Aug-2000
PCRE_VERSION=${PCRE_MAJOR}.${PCRE_MINOR}
# NOTE(review): these look like libtool current:revision:age triples for the
# main library and the POSIX wrapper library - confirm against Makefile.in.
PCRE_LIB_VERSION=0:1:0
PCRE_POSIXLIB_VERSION=0:0:0
# Extract the first word of "gcc", so it can be a program name with args.
set dummy gcc; ac_word=$2
echo $ac_n "checking for $ac_word""... $ac_c" 1>&6
echo "configure:546: checking for $ac_word" >&5
if eval "test \"`echo '$''{'ac_cv_prog_CC'+set}'`\" = set"; then
echo $ac_n "(cached) $ac_c" 1>&6
else
if test -n "$CC"; then
ac_cv_prog_CC="$CC" # Let the user override the test.
else
IFS="${IFS= }"; ac_save_ifs="$IFS"; IFS=":"
ac_dummy="$PATH"
for ac_dir in $ac_dummy; do
test -z "$ac_dir" && ac_dir=.
if test -f $ac_dir/$ac_word; then
ac_cv_prog_CC="gcc"
break
fi
done
IFS="$ac_save_ifs"
fi
fi
CC="$ac_cv_prog_CC"
if test -n "$CC"; then
echo "$ac_t""$CC" 1>&6
else
echo "$ac_t""no" 1>&6
fi
if test -z "$CC"; then
# Extract the first word of "cc", so it can be a program name with args.
set dummy cc; ac_word=$2
echo $ac_n "checking for $ac_word""... $ac_c" 1>&6
echo "configure:576: checking for $ac_word" >&5
if eval "test \"`echo '$''{'ac_cv_prog_CC'+set}'`\" = set"; then
echo $ac_n "(cached) $ac_c" 1>&6
else
if test -n "$CC"; then
ac_cv_prog_CC="$CC" # Let the user override the test.
else
IFS="${IFS= }"; ac_save_ifs="$IFS"; IFS=":"
ac_prog_rejected=no
ac_dummy="$PATH"
for ac_dir in $ac_dummy; do
test -z "$ac_dir" && ac_dir=.
if test -f $ac_dir/$ac_word; then
if test "$ac_dir/$ac_word" = "/usr/ucb/cc"; then
ac_prog_rejected=yes
continue
fi
ac_cv_prog_CC="cc"
break
fi
done
IFS="$ac_save_ifs"
if test $ac_prog_rejected = yes; then
# We found a bogon in the path, so make sure we never use it.
set dummy $ac_cv_prog_CC
shift
if test $# -gt 0; then
# We chose a different compiler from the bogus one.
# However, it has the same basename, so the bogon will be chosen
# first if we set CC to just the basename; use the full file name.
shift
set dummy "$ac_dir/$ac_word" "$@"
shift
ac_cv_prog_CC="$@"
fi
fi
fi
fi
CC="$ac_cv_prog_CC"
if test -n "$CC"; then
echo "$ac_t""$CC" 1>&6
else
echo "$ac_t""no" 1>&6
fi
if test -z "$CC"; then
case "`uname -s`" in
*win32* | *WIN32*)
# Extract the first word of "cl", so it can be a program name with args.
set dummy cl; ac_word=$2
echo $ac_n "checking for $ac_word""... $ac_c" 1>&6
echo "configure:627: checking for $ac_word" >&5
if eval "test \"`echo '$''{'ac_cv_prog_CC'+set}'`\" = set"; then
echo $ac_n "(cached) $ac_c" 1>&6
else
if test -n "$CC"; then
ac_cv_prog_CC="$CC" # Let the user override the test.
else
IFS="${IFS= }"; ac_save_ifs="$IFS"; IFS=":"
ac_dummy="$PATH"
for ac_dir in $ac_dummy; do
test -z "$ac_dir" && ac_dir=.
if test -f $ac_dir/$ac_word; then
ac_cv_prog_CC="cl"
break
fi
done
IFS="$ac_save_ifs"
fi
fi
CC="$ac_cv_prog_CC"
if test -n "$CC"; then
echo "$ac_t""$CC" 1>&6
else
echo "$ac_t""no" 1>&6
fi
;;
esac
fi
test -z "$CC" && { echo "configure: error: no acceptable cc found in \$PATH" 1>&2; exit 1; }
fi
echo $ac_n "checking whether the C compiler ($CC $CFLAGS $LDFLAGS) works""... $ac_c" 1>&6
echo "configure:659: checking whether the C compiler ($CC $CFLAGS $LDFLAGS) works" >&5
ac_ext=c
# CFLAGS is not in ac_cpp because -g, -O, etc. are not valid cpp options.
ac_cpp='$CPP $CPPFLAGS'
ac_compile='${CC-cc} -c $CFLAGS $CPPFLAGS conftest.$ac_ext 1>&5'
ac_link='${CC-cc} -o conftest${ac_exeext} $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS 1>&5'
cross_compiling=$ac_cv_prog_cc_cross
cat > conftest.$ac_ext << EOF
#line 670 "configure"
#include "confdefs.h"
main(){return(0);}
EOF
if { (eval echo configure:675: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
ac_cv_prog_cc_works=yes
# If we can't run a trivial program, we are probably using a cross compiler.
if (./conftest; exit) 2>/dev/null; then
ac_cv_prog_cc_cross=no
else
ac_cv_prog_cc_cross=yes
fi
else
echo "configure: failed program was:" >&5
cat conftest.$ac_ext >&5
ac_cv_prog_cc_works=no
fi
rm -fr conftest*
ac_ext=c
# CFLAGS is not in ac_cpp because -g, -O, etc. are not valid cpp options.
ac_cpp='$CPP $CPPFLAGS'
ac_compile='${CC-cc} -c $CFLAGS $CPPFLAGS conftest.$ac_ext 1>&5'
ac_link='${CC-cc} -o conftest${ac_exeext} $CFLAGS $CPPFLAGS $LDFLAGS conftest.$ac_ext $LIBS 1>&5'
cross_compiling=$ac_cv_prog_cc_cross
echo "$ac_t""$ac_cv_prog_cc_works" 1>&6
if test $ac_cv_prog_cc_works = no; then
{ echo "configure: error: installation or configuration problem: C compiler cannot create executables." 1>&2; exit 1; }
fi
echo $ac_n "checking whether the C compiler ($CC $CFLAGS $LDFLAGS) is a cross-compiler""... $ac_c" 1>&6
echo "configure:701: checking whether the C compiler ($CC $CFLAGS $LDFLAGS) is a cross-compiler" >&5
echo "$ac_t""$ac_cv_prog_cc_cross" 1>&6
cross_compiling=$ac_cv_prog_cc_cross
echo $ac_n "checking whether we are using GNU C""... $ac_c" 1>&6
echo "configure:706: checking whether we are using GNU C" >&5
if eval "test \"`echo '$''{'ac_cv_prog_gcc'+set}'`\" = set"; then
echo $ac_n "(cached) $ac_c" 1>&6
else
cat > conftest.c <&5; (eval $ac_try) 2>&5; }; } | egrep yes >/dev/null 2>&1; then
ac_cv_prog_gcc=yes
else
ac_cv_prog_gcc=no
fi
fi
echo "$ac_t""$ac_cv_prog_gcc" 1>&6
if test $ac_cv_prog_gcc = yes; then
GCC=yes
else
GCC=
fi
ac_test_CFLAGS="${CFLAGS+set}"
ac_save_CFLAGS="$CFLAGS"
CFLAGS=
echo $ac_n "checking whether ${CC-cc} accepts -g""... $ac_c" 1>&6
echo "configure:734: checking whether ${CC-cc} accepts -g" >&5
if eval "test \"`echo '$''{'ac_cv_prog_cc_g'+set}'`\" = set"; then
echo $ac_n "(cached) $ac_c" 1>&6
else
echo 'void f(){}' > conftest.c
if test -z "`${CC-cc} -g -c conftest.c 2>&1`"; then
ac_cv_prog_cc_g=yes
else
ac_cv_prog_cc_g=no
fi
rm -f conftest*
fi
echo "$ac_t""$ac_cv_prog_cc_g" 1>&6
if test "$ac_test_CFLAGS" = set; then
CFLAGS="$ac_save_CFLAGS"
elif test $ac_cv_prog_cc_g = yes; then
if test "$GCC" = yes; then
CFLAGS="-g -O2"
else
CFLAGS="-g"
fi
else
if test "$GCC" = yes; then
CFLAGS="-O2"
else
CFLAGS=
fi
fi
# Extract the first word of "ranlib", so it can be a program name with args.
set dummy ranlib; ac_word=$2
echo $ac_n "checking for $ac_word""... $ac_c" 1>&6
echo "configure:768: checking for $ac_word" >&5
if eval "test \"`echo '$''{'ac_cv_prog_RANLIB'+set}'`\" = set"; then
echo $ac_n "(cached) $ac_c" 1>&6
else
if test -n "$RANLIB"; then
ac_cv_prog_RANLIB="$RANLIB" # Let the user override the test.
else
IFS="${IFS= }"; ac_save_ifs="$IFS"; IFS=":"
ac_dummy="$PATH"
for ac_dir in $ac_dummy; do
test -z "$ac_dir" && ac_dir=.
if test -f $ac_dir/$ac_word; then
ac_cv_prog_RANLIB="ranlib"
break
fi
done
IFS="$ac_save_ifs"
test -z "$ac_cv_prog_RANLIB" && ac_cv_prog_RANLIB=":"
fi
fi
RANLIB="$ac_cv_prog_RANLIB"
if test -n "$RANLIB"; then
echo "$ac_t""$RANLIB" 1>&6
else
echo "$ac_t""no" 1>&6
fi
echo $ac_n "checking how to run the C preprocessor""... $ac_c" 1>&6
echo "configure:798: checking how to run the C preprocessor" >&5
# On Suns, sometimes $CPP names a directory.
if test -n "$CPP" && test -d "$CPP"; then
CPP=
fi
if test -z "$CPP"; then
if eval "test \"`echo '$''{'ac_cv_prog_CPP'+set}'`\" = set"; then
echo $ac_n "(cached) $ac_c" 1>&6
else
# This must be in double quotes, not single quotes, because CPP may get
# substituted into the Makefile and "${CC-cc}" will confuse make.
CPP="${CC-cc} -E"
# On the NeXT, cc -E runs the code through the compiler's parser,
# not just through cpp.
cat > conftest.$ac_ext <
Syntax Error
EOF
ac_try="$ac_cpp conftest.$ac_ext >/dev/null 2>conftest.out"
{ (eval echo configure:819: \"$ac_try\") 1>&5; (eval $ac_try) 2>&5; }
ac_err=`grep -v '^ *+' conftest.out | grep -v "^conftest.${ac_ext}\$"`
if test -z "$ac_err"; then
:
else
echo "$ac_err" >&5
echo "configure: failed program was:" >&5
cat conftest.$ac_ext >&5
rm -rf conftest*
CPP="${CC-cc} -E -traditional-cpp"
cat > conftest.$ac_ext <
Syntax Error
EOF
ac_try="$ac_cpp conftest.$ac_ext >/dev/null 2>conftest.out"
{ (eval echo configure:836: \"$ac_try\") 1>&5; (eval $ac_try) 2>&5; }
ac_err=`grep -v '^ *+' conftest.out | grep -v "^conftest.${ac_ext}\$"`
if test -z "$ac_err"; then
:
else
echo "$ac_err" >&5
echo "configure: failed program was:" >&5
cat conftest.$ac_ext >&5
rm -rf conftest*
CPP="${CC-cc} -nologo -E"
cat > conftest.$ac_ext <
Syntax Error
EOF
ac_try="$ac_cpp conftest.$ac_ext >/dev/null 2>conftest.out"
{ (eval echo configure:853: \"$ac_try\") 1>&5; (eval $ac_try) 2>&5; }
ac_err=`grep -v '^ *+' conftest.out | grep -v "^conftest.${ac_ext}\$"`
if test -z "$ac_err"; then
:
else
echo "$ac_err" >&5
echo "configure: failed program was:" >&5
cat conftest.$ac_ext >&5
rm -rf conftest*
CPP=/lib/cpp
fi
rm -f conftest*
fi
rm -f conftest*
fi
rm -f conftest*
ac_cv_prog_CPP="$CPP"
fi
CPP="$ac_cv_prog_CPP"
else
ac_cv_prog_CPP="$CPP"
fi
echo "$ac_t""$CPP" 1>&6
echo $ac_n "checking for ANSI C header files""... $ac_c" 1>&6
echo "configure:878: checking for ANSI C header files" >&5
if eval "test \"`echo '$''{'ac_cv_header_stdc'+set}'`\" = set"; then
echo $ac_n "(cached) $ac_c" 1>&6
else
cat > conftest.$ac_ext <
#include
#include
#include
EOF
ac_try="$ac_cpp conftest.$ac_ext >/dev/null 2>conftest.out"
{ (eval echo configure:891: \"$ac_try\") 1>&5; (eval $ac_try) 2>&5; }
ac_err=`grep -v '^ *+' conftest.out | grep -v "^conftest.${ac_ext}\$"`
if test -z "$ac_err"; then
rm -rf conftest*
ac_cv_header_stdc=yes
else
echo "$ac_err" >&5
echo "configure: failed program was:" >&5
cat conftest.$ac_ext >&5
rm -rf conftest*
ac_cv_header_stdc=no
fi
rm -f conftest*
if test $ac_cv_header_stdc = yes; then
# SunOS 4.x string.h does not declare mem*, contrary to ANSI.
cat > conftest.$ac_ext <
EOF
if (eval "$ac_cpp conftest.$ac_ext") 2>&5 |
egrep "memchr" >/dev/null 2>&1; then
:
else
rm -rf conftest*
ac_cv_header_stdc=no
fi
rm -f conftest*
fi
if test $ac_cv_header_stdc = yes; then
# ISC 2.0.2 stdlib.h does not declare free, contrary to ANSI.
cat > conftest.$ac_ext <
EOF
if (eval "$ac_cpp conftest.$ac_ext") 2>&5 |
egrep "free" >/dev/null 2>&1; then
:
else
rm -rf conftest*
ac_cv_header_stdc=no
fi
rm -f conftest*
fi
if test $ac_cv_header_stdc = yes; then
# /bin/cc in Irix-4.0.5 gets non-ANSI ctype macros unless using -ansi.
if test "$cross_compiling" = yes; then
:
else
cat > conftest.$ac_ext <
#define ISLOWER(c) ('a' <= (c) && (c) <= 'z')
#define TOUPPER(c) (ISLOWER(c) ? 'A' + ((c) - 'a') : (c))
#define XOR(e, f) (((e) && !(f)) || (!(e) && (f)))
int main () { int i; for (i = 0; i < 256; i++)
if (XOR (islower (i), ISLOWER (i)) || toupper (i) != TOUPPER (i)) exit(2);
exit (0); }
EOF
if { (eval echo configure:958: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext} && (./conftest; exit) 2>/dev/null
then
:
else
echo "configure: failed program was:" >&5
cat conftest.$ac_ext >&5
rm -fr conftest*
ac_cv_header_stdc=no
fi
rm -fr conftest*
fi
fi
fi
echo "$ac_t""$ac_cv_header_stdc" 1>&6
if test $ac_cv_header_stdc = yes; then
cat >> confdefs.h <<\EOF
#define STDC_HEADERS 1
EOF
fi
for ac_hdr in limits.h
do
ac_safe=`echo "$ac_hdr" | sed 'y%./+-%__p_%'`
echo $ac_n "checking for $ac_hdr""... $ac_c" 1>&6
echo "configure:985: checking for $ac_hdr" >&5
if eval "test \"`echo '$''{'ac_cv_header_$ac_safe'+set}'`\" = set"; then
echo $ac_n "(cached) $ac_c" 1>&6
else
cat > conftest.$ac_ext <
EOF
ac_try="$ac_cpp conftest.$ac_ext >/dev/null 2>conftest.out"
{ (eval echo configure:995: \"$ac_try\") 1>&5; (eval $ac_try) 2>&5; }
ac_err=`grep -v '^ *+' conftest.out | grep -v "^conftest.${ac_ext}\$"`
if test -z "$ac_err"; then
rm -rf conftest*
eval "ac_cv_header_$ac_safe=yes"
else
echo "$ac_err" >&5
echo "configure: failed program was:" >&5
cat conftest.$ac_ext >&5
rm -rf conftest*
eval "ac_cv_header_$ac_safe=no"
fi
rm -f conftest*
fi
if eval "test \"`echo '$ac_cv_header_'$ac_safe`\" = yes"; then
echo "$ac_t""yes" 1>&6
ac_tr_hdr=HAVE_`echo $ac_hdr | sed 'y%abcdefghijklmnopqrstuvwxyz./-%ABCDEFGHIJKLMNOPQRSTUVWXYZ___%'`
cat >> confdefs.h <&6
fi
done
echo $ac_n "checking for working const""... $ac_c" 1>&6
echo "configure:1024: checking for working const" >&5
if eval "test \"`echo '$''{'ac_cv_c_const'+set}'`\" = set"; then
echo $ac_n "(cached) $ac_c" 1>&6
else
cat > conftest.$ac_ext <j = 5;
}
{ /* ULTRIX-32 V3.1 (Rev 9) vcc rejects this */
const int foo = 10;
}
; return 0; }
EOF
if { (eval echo configure:1078: \"$ac_compile\") 1>&5; (eval $ac_compile) 2>&5; }; then
rm -rf conftest*
ac_cv_c_const=yes
else
echo "configure: failed program was:" >&5
cat conftest.$ac_ext >&5
rm -rf conftest*
ac_cv_c_const=no
fi
rm -f conftest*
fi
echo "$ac_t""$ac_cv_c_const" 1>&6
if test $ac_cv_c_const = no; then
cat >> confdefs.h <<\EOF
#define const
EOF
fi
echo $ac_n "checking for size_t""... $ac_c" 1>&6
echo "configure:1099: checking for size_t" >&5
if eval "test \"`echo '$''{'ac_cv_type_size_t'+set}'`\" = set"; then
echo $ac_n "(cached) $ac_c" 1>&6
else
cat > conftest.$ac_ext <
#if STDC_HEADERS
#include
#include
#endif
EOF
if (eval "$ac_cpp conftest.$ac_ext") 2>&5 |
egrep "(^|[^a-zA-Z_0-9])size_t[^a-zA-Z_0-9]" >/dev/null 2>&1; then
rm -rf conftest*
ac_cv_type_size_t=yes
else
rm -rf conftest*
ac_cv_type_size_t=no
fi
rm -f conftest*
fi
echo "$ac_t""$ac_cv_type_size_t" 1>&6
if test $ac_cv_type_size_t = no; then
cat >> confdefs.h <<\EOF
#define size_t unsigned
EOF
fi
for ac_func in bcopy memmove strerror
do
echo $ac_n "checking for $ac_func""... $ac_c" 1>&6
echo "configure:1136: checking for $ac_func" >&5
if eval "test \"`echo '$''{'ac_cv_func_$ac_func'+set}'`\" = set"; then
echo $ac_n "(cached) $ac_c" 1>&6
else
cat > conftest.$ac_ext <
/* Override any gcc2 internal prototype to avoid an error. */
/* We use char because int might match the return type of a gcc2
builtin and then its argument prototype would still apply. */
char $ac_func();
int main() {
/* The GNU C library defines this for functions which it implements
to always fail with ENOSYS. Some functions are actually named
something starting with __ and the normal name is an alias. */
#if defined (__stub_$ac_func) || defined (__stub___$ac_func)
choke me
#else
$ac_func();
#endif
; return 0; }
EOF
if { (eval echo configure:1164: \"$ac_link\") 1>&5; (eval $ac_link) 2>&5; } && test -s conftest${ac_exeext}; then
rm -rf conftest*
eval "ac_cv_func_$ac_func=yes"
else
echo "configure: failed program was:" >&5
cat conftest.$ac_ext >&5
rm -rf conftest*
eval "ac_cv_func_$ac_func=no"
fi
rm -f conftest*
fi
if eval "test \"`echo '$ac_cv_func_'$ac_func`\" = yes"; then
echo "$ac_t""yes" 1>&6
ac_tr_func=HAVE_`echo $ac_func | tr 'abcdefghijklmnopqrstuvwxyz' 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'`
cat >> confdefs.h <&6
fi
done
LIBTOOL=./libtool
LIBSUFFIX=la
# Check whether --enable-shared or --disable-shared was given.
if test "${enable_shared+set}" = set; then
enableval="$enable_shared"
if test "$enableval" = "no"; then
LIBTOOL=
LIBSUFFIX=a
fi
fi
# Check whether --enable-utf8 or --disable-utf8 was given.
if test "${enable_utf8+set}" = set; then
enableval="$enable_utf8"
if test "$enableval" = "yes"; then
UTF8=-DSUPPORT_UTF8
fi
fi
trap '' 1 2 15
cat > confcache <<\EOF
# This file is a shell script that caches the results of configure
# tests run on this system so they can be shared between configure
# scripts and configure runs. It is not useful on other systems.
# If it contains results you don't want to keep, you may remove or edit it.
#
# By default, configure uses ./config.cache as the cache file,
# creating it if it does not exist already. You can give configure
# the --cache-file=FILE option to use a different cache file; that is
# what configure does when it calls configure scripts in
# subdirectories, so they share the cache.
# Giving --cache-file=/dev/null disables caching, for debugging configure.
# config.status only pays attention to the cache file if you give it the
# --recheck option to rerun configure.
#
EOF
# The following way of writing the cache mishandles newlines in values,
# but we know of no workaround that is simple, portable, and efficient.
# So, don't put newlines in cache variables' values.
# Ultrix sh set writes to stderr and can't be redirected directly,
# and sets the high bit in the cache file unless we assign to the vars.
(set) 2>&1 |
case `(ac_space=' '; set | grep ac_space) 2>&1` in
*ac_space=\ *)
# `set' does not quote correctly, so add quotes (double-quote substitution
# turns \\\\ into \\, and sed turns \\ into \).
sed -n \
-e "s/'/'\\\\''/g" \
-e "s/^\\([a-zA-Z0-9_]*_cv_[a-zA-Z0-9_]*\\)=\\(.*\\)/\\1=\${\\1='\\2'}/p"
;;
*)
# `set' quotes correctly as required by POSIX, so do not add quotes.
sed -n -e 's/^\([a-zA-Z0-9_]*_cv_[a-zA-Z0-9_]*\)=\(.*\)/\1=${\1=\2}/p'
;;
esac >> confcache
if cmp -s $cache_file confcache; then
:
else
if test -w $cache_file; then
echo "updating cache $cache_file"
cat confcache > $cache_file
else
echo "not updating unwritable cache $cache_file"
fi
fi
rm -f confcache
trap 'rm -fr conftest* confdefs* core core.* *.core $ac_clean_files; exit 1' 1 2 15
test "x$prefix" = xNONE && prefix=$ac_default_prefix
# Let make expand exec_prefix.
test "x$exec_prefix" = xNONE && exec_prefix='${prefix}'
# Any assignment to VPATH causes Sun make to only execute
# the first set of double-colon rules, so remove it if not needed.
# If there is a colon in the path, we need to keep it.
if test "x$srcdir" = x.; then
ac_vpsub='/^[ ]*VPATH[ ]*=[^:]*$/d'
fi
trap 'rm -f $CONFIG_STATUS conftest*; exit 1' 1 2 15
DEFS=-DHAVE_CONFIG_H
# Without the "./", some shells look in PATH for config.status.
: ${CONFIG_STATUS=./config.status}
echo creating $CONFIG_STATUS
rm -f $CONFIG_STATUS
cat > $CONFIG_STATUS </dev/null | sed 1q`:
#
# $0 $ac_configure_args
#
# Compiler output produced by configure, useful for debugging
# configure, is in ./config.log if it exists.
ac_cs_usage="Usage: $CONFIG_STATUS [--recheck] [--version] [--help]"
for ac_option
do
case "\$ac_option" in
-recheck | --recheck | --rechec | --reche | --rech | --rec | --re | --r)
echo "running \${CONFIG_SHELL-/bin/sh} $0 $ac_configure_args --no-create --no-recursion"
exec \${CONFIG_SHELL-/bin/sh} $0 $ac_configure_args --no-create --no-recursion ;;
-version | --version | --versio | --versi | --vers | --ver | --ve | --v)
echo "$CONFIG_STATUS generated by autoconf version 2.13"
exit 0 ;;
-help | --help | --hel | --he | --h)
echo "\$ac_cs_usage"; exit 0 ;;
*) echo "\$ac_cs_usage"; exit 1 ;;
esac
done
ac_given_srcdir=$srcdir
trap 'rm -fr `echo "Makefile pcre.h:pcre.in pcre-config:pcre-config.in RunTest:RunTest.in config.h:config.in" | sed "s/:[^ ]*//g"` conftest*; exit 1' 1 2 15
EOF
cat >> $CONFIG_STATUS < conftest.subs <<\\CEOF
$ac_vpsub
$extrasub
s%@SHELL@%$SHELL%g
s%@CFLAGS@%$CFLAGS%g
s%@CPPFLAGS@%$CPPFLAGS%g
s%@CXXFLAGS@%$CXXFLAGS%g
s%@FFLAGS@%$FFLAGS%g
s%@DEFS@%$DEFS%g
s%@LDFLAGS@%$LDFLAGS%g
s%@LIBS@%$LIBS%g
s%@exec_prefix@%$exec_prefix%g
s%@prefix@%$prefix%g
s%@program_transform_name@%$program_transform_name%g
s%@bindir@%$bindir%g
s%@sbindir@%$sbindir%g
s%@libexecdir@%$libexecdir%g
s%@datadir@%$datadir%g
s%@sysconfdir@%$sysconfdir%g
s%@sharedstatedir@%$sharedstatedir%g
s%@localstatedir@%$localstatedir%g
s%@libdir@%$libdir%g
s%@includedir@%$includedir%g
s%@oldincludedir@%$oldincludedir%g
s%@infodir@%$infodir%g
s%@mandir@%$mandir%g
s%@CC@%$CC%g
s%@RANLIB@%$RANLIB%g
s%@CPP@%$CPP%g
s%@HAVE_MEMMOVE@%$HAVE_MEMMOVE%g
s%@HAVE_STRERROR@%$HAVE_STRERROR%g
s%@LIBTOOL@%$LIBTOOL%g
s%@LIBSUFFIX@%$LIBSUFFIX%g
s%@UTF8@%$UTF8%g
s%@PCRE_MAJOR@%$PCRE_MAJOR%g
s%@PCRE_MINOR@%$PCRE_MINOR%g
s%@PCRE_DATE@%$PCRE_DATE%g
s%@PCRE_VERSION@%$PCRE_VERSION%g
s%@PCRE_LIB_VERSION@%$PCRE_LIB_VERSION%g
s%@PCRE_POSIXLIB_VERSION@%$PCRE_POSIXLIB_VERSION%g
CEOF
EOF
cat >> $CONFIG_STATUS <<\EOF
# Split the substitutions into bite-sized pieces for seds with
# small command number limits, like on Digital OSF/1 and HP-UX.
ac_max_sed_cmds=90 # Maximum number of lines to put in a sed script.
ac_file=1 # Number of current file.
ac_beg=1 # First line for current file.
ac_end=$ac_max_sed_cmds # Line after last line for current file.
ac_more_lines=:
ac_sed_cmds=""
while $ac_more_lines; do
if test $ac_beg -gt 1; then
sed "1,${ac_beg}d; ${ac_end}q" conftest.subs > conftest.s$ac_file
else
sed "${ac_end}q" conftest.subs > conftest.s$ac_file
fi
if test ! -s conftest.s$ac_file; then
ac_more_lines=false
rm -f conftest.s$ac_file
else
if test -z "$ac_sed_cmds"; then
ac_sed_cmds="sed -f conftest.s$ac_file"
else
ac_sed_cmds="$ac_sed_cmds | sed -f conftest.s$ac_file"
fi
ac_file=`expr $ac_file + 1`
ac_beg=$ac_end
ac_end=`expr $ac_end + $ac_max_sed_cmds`
fi
done
if test -z "$ac_sed_cmds"; then
ac_sed_cmds=cat
fi
EOF
cat >> $CONFIG_STATUS <> $CONFIG_STATUS <<\EOF
for ac_file in .. $CONFIG_FILES; do if test "x$ac_file" != x..; then
# Support "outfile[:infile[:infile...]]", defaulting infile="outfile.in".
case "$ac_file" in
*:*) ac_file_in=`echo "$ac_file"|sed 's%[^:]*:%%'`
ac_file=`echo "$ac_file"|sed 's%:.*%%'` ;;
*) ac_file_in="${ac_file}.in" ;;
esac
# Adjust a relative srcdir, top_srcdir, and INSTALL for subdirectories.
# Remove last slash and all that follows it. Not all systems have dirname.
ac_dir=`echo $ac_file|sed 's%/[^/][^/]*$%%'`
if test "$ac_dir" != "$ac_file" && test "$ac_dir" != .; then
# The file is in a subdirectory.
test ! -d "$ac_dir" && mkdir "$ac_dir"
ac_dir_suffix="/`echo $ac_dir|sed 's%^\./%%'`"
# A "../" for each directory in $ac_dir_suffix.
ac_dots=`echo $ac_dir_suffix|sed 's%/[^/]*%../%g'`
else
ac_dir_suffix= ac_dots=
fi
case "$ac_given_srcdir" in
.) srcdir=.
if test -z "$ac_dots"; then top_srcdir=.
else top_srcdir=`echo $ac_dots|sed 's%/$%%'`; fi ;;
/*) srcdir="$ac_given_srcdir$ac_dir_suffix"; top_srcdir="$ac_given_srcdir" ;;
*) # Relative path.
srcdir="$ac_dots$ac_given_srcdir$ac_dir_suffix"
top_srcdir="$ac_dots$ac_given_srcdir" ;;
esac
echo creating "$ac_file"
rm -f "$ac_file"
configure_input="Generated automatically from `echo $ac_file_in|sed 's%.*/%%'` by configure."
case "$ac_file" in
*Makefile*) ac_comsub="1i\\
# $configure_input" ;;
*) ac_comsub= ;;
esac
ac_file_inputs=`echo $ac_file_in|sed -e "s%^%$ac_given_srcdir/%" -e "s%:% $ac_given_srcdir/%g"`
sed -e "$ac_comsub
s%@configure_input@%$configure_input%g
s%@srcdir@%$srcdir%g
s%@top_srcdir@%$top_srcdir%g
" $ac_file_inputs | (eval "$ac_sed_cmds") > $ac_file
fi; done
rm -f conftest.s*
# These sed commands are passed to sed as "A NAME B NAME C VALUE D", where
# NAME is the cpp macro being defined and VALUE is the value it is being given.
#
# ac_d sets the value in "#define NAME VALUE" lines.
ac_dA='s%^\([ ]*\)#\([ ]*define[ ][ ]*\)'
ac_dB='\([ ][ ]*\)[^ ]*%\1#\2'
ac_dC='\3'
ac_dD='%g'
# ac_u turns "#undef NAME" with trailing blanks into "#define NAME VALUE".
ac_uA='s%^\([ ]*\)#\([ ]*\)undef\([ ][ ]*\)'
ac_uB='\([ ]\)%\1#\2define\3'
ac_uC=' '
ac_uD='\4%g'
# ac_e turns "#undef NAME" without trailing blanks into "#define NAME VALUE".
ac_eA='s%^\([ ]*\)#\([ ]*\)undef\([ ][ ]*\)'
ac_eB='$%\1#\2define\3'
ac_eC=' '
ac_eD='%g'
if test "${CONFIG_HEADERS+set}" != set; then
EOF
cat >> $CONFIG_STATUS <> $CONFIG_STATUS <<\EOF
fi
for ac_file in .. $CONFIG_HEADERS; do if test "x$ac_file" != x..; then
# Support "outfile[:infile[:infile...]]", defaulting infile="outfile.in".
case "$ac_file" in
*:*) ac_file_in=`echo "$ac_file"|sed 's%[^:]*:%%'`
ac_file=`echo "$ac_file"|sed 's%:.*%%'` ;;
*) ac_file_in="${ac_file}.in" ;;
esac
echo creating $ac_file
rm -f conftest.frag conftest.in conftest.out
ac_file_inputs=`echo $ac_file_in|sed -e "s%^%$ac_given_srcdir/%" -e "s%:% $ac_given_srcdir/%g"`
cat $ac_file_inputs > conftest.in
EOF
# Transform confdefs.h into a sed script conftest.vals that substitutes
# the proper values into config.h.in to produce config.h. And first:
# Protect against being on the right side of a sed subst in config.status.
# Protect against being in an unquoted here document in config.status.
rm -f conftest.vals
cat > conftest.hdr <<\EOF
s/[\\&%]/\\&/g
s%[\\$`]%\\&%g
s%#define \([A-Za-z_][A-Za-z0-9_]*\) *\(.*\)%${ac_dA}\1${ac_dB}\1${ac_dC}\2${ac_dD}%gp
s%ac_d%ac_u%gp
s%ac_u%ac_e%gp
EOF
sed -n -f conftest.hdr confdefs.h > conftest.vals
rm -f conftest.hdr
# This sed command replaces #undef with comments. This is necessary, for
# example, in the case of _POSIX_SOURCE, which is predefined and required
# on some systems where configure will not decide to define it.
cat >> conftest.vals <<\EOF
s%^[ ]*#[ ]*undef[ ][ ]*[a-zA-Z_][a-zA-Z_0-9]*%/* & */%
EOF
# Break up conftest.vals because some shells have a limit on
# the size of here documents, and old seds have small limits too.
rm -f conftest.tail
while :
do
ac_lines=`grep -c . conftest.vals`
# grep -c gives empty output for an empty file on some AIX systems.
if test -z "$ac_lines" || test "$ac_lines" -eq 0; then break; fi
# Write a limited-size here document to conftest.frag.
echo ' cat > conftest.frag <> $CONFIG_STATUS
sed ${ac_max_here_lines}q conftest.vals >> $CONFIG_STATUS
echo 'CEOF
sed -f conftest.frag conftest.in > conftest.out
rm -f conftest.in
mv conftest.out conftest.in
' >> $CONFIG_STATUS
sed 1,${ac_max_here_lines}d conftest.vals > conftest.tail
rm -f conftest.vals
mv conftest.tail conftest.vals
done
rm -f conftest.vals
cat >> $CONFIG_STATUS <<\EOF
rm -f conftest.frag conftest.h
echo "/* $ac_file. Generated automatically by configure. */" > conftest.h
cat conftest.in >> conftest.h
rm -f conftest.in
if cmp -s $ac_file conftest.h 2>/dev/null; then
echo "$ac_file is unchanged"
rm -f conftest.h
else
# Remove last slash and all that follows it. Not all systems have dirname.
ac_dir=`echo $ac_file|sed 's%/[^/][^/]*$%%'`
if test "$ac_dir" != "$ac_file" && test "$ac_dir" != .; then
# The file is in a subdirectory.
test ! -d "$ac_dir" && mkdir "$ac_dir"
fi
rm -f $ac_file
mv conftest.h $ac_file
fi
fi; done
EOF
cat >> $CONFIG_STATUS <> $CONFIG_STATUS <<\EOF
chmod a+x RunTest pcre-config
exit 0
EOF
chmod +x $CONFIG_STATUS
rm -fr confdefs* $ac_clean_files
test "$no_create" = yes || ${CONFIG_SHELL-/bin/sh} $CONFIG_STATUS || exit 1
privoxy-3.0.21-stable/./pcre/pcre-config 000750 001751 001751 00000002126 10546014100 017024 0 ustar 00fk fk 000000 000000 #!/bin/sh
# pcre-config: prints the installation prefix, version, and compiler/linker
# flags for PCRE. Options are processed strictly left to right, so a
# --prefix=DIR or --exec-prefix=DIR only affects the queries that follow it.

# Default installation prefix; replaced by --prefix=DIR.
prefix=/usr/local
# Starts out equal to the prefix; replaced by --exec-prefix=DIR.
exec_prefix=${prefix}
# Becomes "yes" once --exec-prefix=DIR is seen, after which --prefix=DIR
# no longer overwrites exec_prefix.
exec_prefix_set=no
usage="\
Usage: pcre-config [--prefix] [--exec-prefix] [--version] [--libs] [--libs-posix] [--cflags] [--cflags-posix]"
# At least one option is required: otherwise print usage on stderr and fail.
if test $# -eq 0; then
echo "${usage}" 1>&2
exit 1
fi
# Handle one argument per iteration, shifting it off at the bottom.
while test $# -gt 0; do
# For "-name=value" arguments, optarg is the part after the "="; it is
# empty for every other argument.
case "$1" in
-*=*) optarg=`echo "$1" | sed 's/[-_a-zA-Z0-9]*=//'` ;;
*) optarg= ;;
esac
case $1 in
--prefix=*)
# Override the prefix; exec_prefix follows it unless it was set explicitly.
prefix=$optarg
if test $exec_prefix_set = no ; then
exec_prefix=$optarg
fi
;;
--prefix)
echo $prefix
;;
--exec-prefix=*)
exec_prefix=$optarg
exec_prefix_set=yes
;;
--exec-prefix)
echo $exec_prefix
;;
--version)
echo 3.4
;;
--cflags | --cflags-posix)
# Emit -I only when the headers are not in /usr/include; if they are,
# $includes stays unset and an empty line is printed.
if test ${prefix}/include != /usr/include ; then
includes=-I${prefix}/include
fi
echo $includes
;;
--libs-posix)
# The POSIX wrapper library is listed before the native library.
echo -L${exec_prefix}/lib -lpcreposix -lpcre
;;
--libs)
echo -L${exec_prefix}/lib -lpcre
;;
*)
# Unrecognised argument: print usage on stderr and fail.
echo "${usage}" 1>&2
exit 1
;;
esac
shift
done
privoxy-3.0.21-stable/./pcre/pcregrep.c 000640 001751 001751 00000011572 10546014100 016663 0 ustar 00fk fk 000000 000000 /*************************************************
* pcregrep program *
*************************************************/
/* This is a grep program that uses the PCRE regular expression library to do
its pattern matching. */
#include <stdio.h>
#include <string.h>
#include <stdlib.h>
#include <errno.h>
#include "config.h"
#include "pcre.h"
#define FALSE 0
#define TRUE 1
typedef int BOOL;
/*************************************************
* Global variables *
*************************************************/
/* Compiled regular expression; assigned from pcre_compile() in main(). */
static pcre *pattern;
/* Result of pcre_study() on the pattern; passed to every pcre_exec() call. */
static pcre_extra *hints;
/* -c: print only a count of selected lines per input. */
static BOOL count_only = FALSE;
/* -l: print only the name of each input that has a selected line. */
static BOOL filenames_only = FALSE;
/* -v: select the lines that do NOT match. */
static BOOL invert = FALSE;
/* -n: prefix each printed line with its line number. */
static BOOL number = FALSE;
/* -s: print nothing; pcregrep() returns at the first selected line. */
static BOOL silent = FALSE;
/* -x: a match counts only if it ends at the end of the line. */
static BOOL whole_lines = FALSE;
#if ! HAVE_STRERROR
/*************************************************
* Provide strerror() for non-ANSI libraries *
*************************************************/
/* Some old-fashioned systems still around (e.g. SunOS4) don't have strerror()
in their libraries, but can provide the same facility by this simple
alternative function. */
extern int sys_nerr;
extern char *sys_errlist[];
/* Replacement strerror(): look the error number up in the system's
sys_errlist[] table, which holds sys_nerr entries. Numbers outside that
table yield a fixed "unknown" message. */
char *
strerror(int n)
{
  if (n >= 0 && n < sys_nerr)
    {
    return sys_errlist[n];
    }
  return "unknown error number";
}
#endif /* HAVE_STRERROR */
/*************************************************
* Grep an individual file *
*************************************************/
static int
pcregrep(FILE *in, char *name)
{
int rc = 1;
int linenumber = 0;
int count = 0;
int offsets[99];
char buffer[BUFSIZ];
while (fgets(buffer, sizeof(buffer), in) != NULL)
{
BOOL match;
int length = (int)strlen(buffer);
if (length > 0 && buffer[length-1] == '\n') buffer[--length] = 0;
linenumber++;
match = pcre_exec(pattern, hints, buffer, length, 0, 0, offsets, 99) >= 0;
if (match && whole_lines && offsets[1] != length) match = FALSE;
if (match != invert)
{
if (count_only) count++;
else if (filenames_only)
{
fprintf(stdout, "%s\n", (name == NULL)? "" : name);
return 0;
}
else if (silent) return 0;
else
{
if (name != NULL) fprintf(stdout, "%s:", name);
if (number) fprintf(stdout, "%d:", linenumber);
fprintf(stdout, "%s\n", buffer);
}
rc = 0;
}
}
if (count_only)
{
if (name != NULL) fprintf(stdout, "%s:", name);
fprintf(stdout, "%d\n", count);
}
return rc;
}
/*************************************************
* Usage function *
*************************************************/
/* Write the one-line synopsis to stderr and hand back rc unchanged, so that a
caller can write "return usage(code);". */
static int
usage(int rc)
{
  fputs("Usage: pcregrep [-Vchilnsvx] pattern [file] ...\n", stderr);
  return rc;
}
/*************************************************
* Main program *
*************************************************/
/* Entry point: parse the option letters, compile and study the pattern, then
run pcregrep() over stdin or over each named file.

Exit status:
  0  at least one line was selected
  1  no line was selected
  2  an error occurred: unknown option, missing pattern, bad pattern,
     study failure, or a file that could not be opened

If -V was given and there is no pattern, the usage line is still printed but
the status is 0, as before. */
int
main(int argc, char **argv)
{
  int i;
  int rc = 1;
  int options = 0;
  int errptr;
  const char *error;
  BOOL filenames = TRUE;
  BOOL version_shown = FALSE;

  /* Process the options. Every leading argument that starts with '-' is a
  bundle of single-letter options. */
  for (i = 1; i < argc; i++)
    {
    char *s;
    if (argv[i][0] != '-') break;
    s = argv[i] + 1;
    while (*s != 0)
      {
      switch (*s++)
        {
        case 'c': count_only = TRUE; break;
        case 'h': filenames = FALSE; break;
        case 'i': options |= PCRE_CASELESS; break;
        /* This case used to fall through into 'n'. That was harmless, because
        pcregrep() never reaches the line-number output when filenames_only
        is set, but it was not intended. */
        case 'l': filenames_only = TRUE; break;
        case 'n': number = TRUE; break;
        case 's': silent = TRUE; break;
        case 'v': invert = TRUE; break;
        case 'x': whole_lines = TRUE; options |= PCRE_ANCHORED; break;
        case 'V':
          fprintf(stderr, "PCRE version %s\n", pcre_version());
          version_shown = TRUE;
          break;
        default:
          fprintf(stderr, "pcregrep: unknown option %c\n", s[-1]);
          return usage(2);
        }
      }
    }

  /* There must be at least a regexp argument. A missing pattern is a usage
  error and is reported with status 2, like the other errors, unless the
  user only asked for the version with -V. */
  if (i >= argc) return usage(version_shown ? 0 : 2);

  /* Compile the regular expression. */
  pattern = pcre_compile(argv[i++], options, &error, &errptr, NULL);
  if (pattern == NULL)
    {
    fprintf(stderr, "pcregrep: error in regex at offset %d: %s\n", errptr, error);
    return 2;
    }

  /* Study the regular expression, as we will be running it many times */
  hints = pcre_study(pattern, 0, &error);
  if (error != NULL)
    {
    fprintf(stderr, "pcregrep: error while studing regex: %s\n", error);
    return 2;
    }

  /* If there are no further arguments, do the business on stdin and exit */
  if (i >= argc) return pcregrep(stdin, NULL);

  /* Otherwise, work through the remaining arguments as files. If there is only
  one, don't give its name on the output, except that -l always needs it. */
  if (i == argc - 1) filenames = FALSE;
  if (filenames_only) filenames = TRUE;

  for (; i < argc; i++)
    {
    FILE *in = fopen(argv[i], "r");
    if (in == NULL)
      {
      /* Report the failure and carry on with the remaining files. */
      fprintf(stderr, "%s: failed to open: %s\n", argv[i], strerror(errno));
      rc = 2;
      }
    else
      {
      int frc = pcregrep(in, filenames? argv[i] : NULL);
      /* A selected line upgrades "nothing selected" (1) to success (0), but
      never hides an earlier open failure (2). */
      if (frc == 0 && rc == 1) rc = 0;
      fclose(in);
      }
    }

  return rc;
}
/* End */
privoxy-3.0.21-stable/./pcre/vc_dftables.dsp 000750 001751 001751 00000022574 10546014100 017702 0 ustar 00fk fk 000000 000000 # Microsoft Developer Studio Project File - Name="vc_dftables" - Package Owner=<4>
# Microsoft Developer Studio Generated Build File, Format Version 5.00
# ** DO NOT EDIT **
# TARGTYPE "Win32 (x86) Console Application" 0x0103
CFG=vc_dftables - Win32 Debug with Win32 threads
!MESSAGE This is not a valid makefile. To build this project using NMAKE,
!MESSAGE use the Export Makefile command and run
!MESSAGE
!MESSAGE NMAKE /f "vc_dftables.mak".
!MESSAGE
!MESSAGE You can specify a configuration when running NMAKE
!MESSAGE by defining the macro CFG on the command line. For example:
!MESSAGE
!MESSAGE NMAKE /f "vc_dftables.mak"\
CFG="vc_dftables - Win32 Debug with Win32 threads"
!MESSAGE
!MESSAGE Possible choices for configuration are:
!MESSAGE
!MESSAGE "vc_dftables - Win32 Release" (based on\
"Win32 (x86) Console Application")
!MESSAGE "vc_dftables - Win32 Debug" (based on\
"Win32 (x86) Console Application")
!MESSAGE "vc_dftables - Win32 Debug with Win32 threads" (based on\
"Win32 (x86) Console Application")
!MESSAGE "vc_dftables - Win32 Release with Win32 threads" (based on\
"Win32 (x86) Console Application")
!MESSAGE
# Begin Project
# PROP Scc_ProjName ""
# PROP Scc_LocalPath ""
CPP=cl.exe
RSC=rc.exe
!IF "$(CFG)" == "vc_dftables - Win32 Release"
# PROP BASE Use_MFC 0
# PROP BASE Use_Debug_Libraries 0
# PROP BASE Output_Dir "Release"
# PROP BASE Intermediate_Dir "Release"
# PROP BASE Target_Dir ""
# PROP Use_MFC 0
# PROP Use_Debug_Libraries 0
# PROP Output_Dir "vc_dftables"
# PROP Intermediate_Dir "vc_dftables"
# PROP Target_Dir ""
# ADD BASE CPP /nologo /W3 /GX /O2 /D "WIN32" /D "NDEBUG" /D "_CONSOLE" /D "_MBCS" /YX /FD /c
# ADD CPP /nologo /W3 /GX /O2 /D "WIN32" /D "NDEBUG" /D "_CONSOLE" /D "_MBCS" /YX /FD /c
# ADD BASE RSC /l 0x809 /d "NDEBUG"
# ADD RSC /l 0x809 /d "NDEBUG"
BSC32=bscmake.exe
# ADD BASE BSC32 /nologo
# ADD BSC32 /nologo
LINK32=link.exe
# ADD BASE LINK32 kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib /nologo /subsystem:console /machine:I386
# ADD LINK32 kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib /nologo /subsystem:console /machine:I386
# Begin Special Build Tool
OutDir=.\vc_dftables
SOURCE=$(InputPath)
PostBuild_Desc=Running program to generate chartables.c
PostBuild_Cmds=$(OutDir)\vc_dftables.exe >$(OutDir)\..\chartables.c
# End Special Build Tool
!ELSEIF "$(CFG)" == "vc_dftables - Win32 Debug"
# PROP BASE Use_MFC 0
# PROP BASE Use_Debug_Libraries 1
# PROP BASE Output_Dir "Debug"
# PROP BASE Intermediate_Dir "Debug"
# PROP BASE Target_Dir ""
# PROP Use_MFC 0
# PROP Use_Debug_Libraries 1
# PROP Output_Dir "vc_dftables_dbg"
# PROP Intermediate_Dir "vc_dftables_dbg"
# PROP Ignore_Export_Lib 0
# PROP Target_Dir ""
# ADD BASE CPP /nologo /W3 /Gm /GX /Zi /Od /D "WIN32" /D "_DEBUG" /D "_CONSOLE" /D "_MBCS" /YX /FD /c
# ADD CPP /nologo /W3 /Gm /GX /Zi /Od /D "WIN32" /D "_DEBUG" /D "_CONSOLE" /D "_MBCS" /YX /FD /c
# ADD BASE RSC /l 0x809 /d "_DEBUG"
# ADD RSC /l 0x809 /d "_DEBUG"
BSC32=bscmake.exe
# ADD BASE BSC32 /nologo
# ADD BSC32 /nologo
LINK32=link.exe
# ADD BASE LINK32 kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib /nologo /subsystem:console /debug /machine:I386 /pdbtype:sept
# ADD LINK32 kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib /nologo /subsystem:console /debug /machine:I386 /pdbtype:sept
# Begin Special Build Tool
OutDir=.\vc_dftables_dbg
SOURCE=$(InputPath)
PostBuild_Desc=Running program to generate chartables.c
PostBuild_Cmds=$(OutDir)\vc_dftables.exe >$(OutDir)\..\chartables.c
# End Special Build Tool
!ELSEIF "$(CFG)" == "vc_dftables - Win32 Debug with Win32 threads"
# PROP BASE Use_MFC 0
# PROP BASE Use_Debug_Libraries 1
# PROP BASE Output_Dir "vc_dftab"
# PROP BASE Intermediate_Dir "vc_dftab"
# PROP BASE Ignore_Export_Lib 0
# PROP BASE Target_Dir ""
# PROP Use_MFC 0
# PROP Use_Debug_Libraries 1
# PROP Output_Dir "vc_dftables_dbg"
# PROP Intermediate_Dir "vc_dftables_dbg"
# PROP Ignore_Export_Lib 0
# PROP Target_Dir ""
# ADD BASE CPP /nologo /W3 /Gm /GX /Zi /Od /D "WIN32" /D "_DEBUG" /D "_CONSOLE" /D "_MBCS" /YX /FD /c
# ADD CPP /nologo /W3 /Gm /GX /Zi /Od /D "WIN32" /D "_DEBUG" /D "_CONSOLE" /D "_MBCS" /YX /FD /c
# ADD BASE RSC /l 0x809 /d "_DEBUG"
# ADD RSC /l 0x809 /d "_DEBUG"
BSC32=bscmake.exe
# ADD BASE BSC32 /nologo
# ADD BSC32 /nologo
LINK32=link.exe
# ADD BASE LINK32 kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib /nologo /subsystem:console /debug /machine:I386 /pdbtype:sept
# ADD LINK32 kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib /nologo /subsystem:console /debug /machine:I386 /pdbtype:sept
# Begin Special Build Tool
OutDir=.\vc_dftables_dbg
SOURCE=$(InputPath)
PostBuild_Desc=Running program to generate chartables.c
PostBuild_Cmds=$(OutDir)\vc_dftables.exe >$(OutDir)\..\chartables.c
# End Special Build Tool
!ELSEIF "$(CFG)" == "vc_dftables - Win32 Release with Win32 threads"
# PROP BASE Use_MFC 0
# PROP BASE Use_Debug_Libraries 0
# PROP BASE Output_Dir "vc_dfta0"
# PROP BASE Intermediate_Dir "vc_dfta0"
# PROP BASE Target_Dir ""
# PROP Use_MFC 0
# PROP Use_Debug_Libraries 0
# PROP Output_Dir "vc_dftables"
# PROP Intermediate_Dir "vc_dftables"
# PROP Target_Dir ""
# ADD BASE CPP /nologo /W3 /GX /O2 /D "WIN32" /D "NDEBUG" /D "_CONSOLE" /D "_MBCS" /YX /FD /c
# ADD CPP /nologo /W3 /GX /O2 /D "WIN32" /D "NDEBUG" /D "_CONSOLE" /D "_MBCS" /YX /FD /c
# ADD BASE RSC /l 0x809 /d "NDEBUG"
# ADD RSC /l 0x809 /d "NDEBUG"
BSC32=bscmake.exe
# ADD BASE BSC32 /nologo
# ADD BSC32 /nologo
LINK32=link.exe
# ADD BASE LINK32 kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib /nologo /subsystem:console /machine:I386
# ADD LINK32 kernel32.lib user32.lib gdi32.lib winspool.lib comdlg32.lib advapi32.lib shell32.lib ole32.lib oleaut32.lib uuid.lib odbc32.lib odbccp32.lib /nologo /subsystem:console /machine:I386
# Begin Special Build Tool
OutDir=.\vc_dftables
SOURCE=$(InputPath)
PostBuild_Desc=Running program to generate chartables.c
PostBuild_Cmds=$(OutDir)\vc_dftables.exe >$(OutDir)\..\chartables.c
# End Special Build Tool
!ENDIF
# Begin Target
# Name "vc_dftables - Win32 Release"
# Name "vc_dftables - Win32 Debug"
# Name "vc_dftables - Win32 Debug with Win32 threads"
# Name "vc_dftables - Win32 Release with Win32 threads"
# Begin Group "File Copy"
# PROP Default_Filter ""
# Begin Source File
SOURCE=..\vc_config_pthreads.h
!IF "$(CFG)" == "vc_dftables - Win32 Release"
# PROP Ignore_Default_Tool 1
# Begin Custom Build - Copying vc_config_pthreads.h
WkspDir=.
InputPath=..\vc_config_pthreads.h
"$(WkspDir)\..\config.h" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
copy "$(InputPath)" "$(WkspDir)\..\config.h"
# End Custom Build
!ELSEIF "$(CFG)" == "vc_dftables - Win32 Debug"
# PROP Ignore_Default_Tool 1
# Begin Custom Build - Copying vc_config_pthreads.h
WkspDir=.
InputPath=..\vc_config_pthreads.h
"$(WkspDir)\..\config.h" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
copy "$(InputPath)" "$(WkspDir)\..\config.h"
# End Custom Build
!ELSEIF "$(CFG)" == "vc_dftables - Win32 Debug with Win32 threads"
# PROP Exclude_From_Build 1
# PROP Ignore_Default_Tool 1
!ELSEIF "$(CFG)" == "vc_dftables - Win32 Release with Win32 threads"
# PROP Exclude_From_Build 1
# PROP Ignore_Default_Tool 1
!ENDIF
# End Source File
# Begin Source File
SOURCE=..\vc_config_winthreads.h
!IF "$(CFG)" == "vc_dftables - Win32 Release"
# PROP Exclude_From_Build 1
# PROP Ignore_Default_Tool 1
!ELSEIF "$(CFG)" == "vc_dftables - Win32 Debug"
# PROP Exclude_From_Build 1
# PROP Ignore_Default_Tool 1
!ELSEIF "$(CFG)" == "vc_dftables - Win32 Debug with Win32 threads"
# PROP Ignore_Default_Tool 1
# Begin Custom Build - Copying vc_config_winthreads.h
WkspDir=.
InputPath=..\vc_config_winthreads.h
"$(WkspDir)\..\config.h" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
copy "$(InputPath)" "$(WkspDir)\..\config.h"
# End Custom Build
!ELSEIF "$(CFG)" == "vc_dftables - Win32 Release with Win32 threads"
# PROP Ignore_Default_Tool 1
# Begin Custom Build - Copying vc_config_winthreads.h
WkspDir=.
InputPath=..\vc_config_winthreads.h
"$(WkspDir)\..\config.h" : $(SOURCE) "$(INTDIR)" "$(OUTDIR)"
copy "$(InputPath)" "$(WkspDir)\..\config.h"
# End Custom Build
!ENDIF
# End Source File
# End Group
# Begin Source File
SOURCE=..\config.h
# End Source File
# Begin Source File
SOURCE=.\config.h
# End Source File
# Begin Source File
SOURCE=.\dftables.c
# End Source File
# Begin Source File
SOURCE=.\internal.h
# End Source File
# Begin Source File
SOURCE=.\maketables.c
!IF "$(CFG)" == "vc_dftables - Win32 Release"
# PROP Exclude_From_Build 1
!ELSEIF "$(CFG)" == "vc_dftables - Win32 Debug"
# PROP Exclude_From_Build 1
!ELSEIF "$(CFG)" == "vc_dftables - Win32 Debug with Win32 threads"
# PROP BASE Exclude_From_Build 1
# PROP Exclude_From_Build 1
!ELSEIF "$(CFG)" == "vc_dftables - Win32 Release with Win32 threads"
# PROP BASE Exclude_From_Build 1
# PROP Exclude_From_Build 1
!ENDIF
# End Source File
# Begin Source File
SOURCE=.\pcre.h
# End Source File
# End Target
# End Project
privoxy-3.0.21-stable/./pcre/doc/pcreposix.3 000640 001751 001751 00000013625 10546014100 017556 0 ustar 00fk fk 000000 000000 .TH PCRE 3
.SH NAME
pcreposix - POSIX API for Perl-compatible regular expressions.
.SH SYNOPSIS
.B #include <pcreposix.h>
.PP
.SM
.br
.B int regcomp(regex_t *\fIpreg\fR, const char *\fIpattern\fR,
.ti +5n
.B int \fIcflags\fR);
.PP
.br
.B int regexec(regex_t *\fIpreg\fR, const char *\fIstring\fR,
.ti +5n
.B size_t \fInmatch\fR, regmatch_t \fIpmatch\fR[], int \fIeflags\fR);
.PP
.br
.B size_t regerror(int \fIerrcode\fR, const regex_t *\fIpreg\fR,
.ti +5n
.B char *\fIerrbuf\fR, size_t \fIerrbuf_size\fR);
.PP
.br
.B void regfree(regex_t *\fIpreg\fR);
.SH DESCRIPTION
This set of functions provides a POSIX-style API to the PCRE regular expression
package. See the \fBpcre\fR documentation for a description of the native API,
which contains additional functionality.
The functions described here are just wrapper functions that ultimately call
the native API. Their prototypes are defined in the \fBpcreposix.h\fR header
file, and on Unix systems the library itself is called \fBpcreposix.a\fR, so
can be accessed by adding \fB-lpcreposix\fR to the command for linking an
application which uses them. Because the POSIX functions call the native ones,
it is also necessary to add \fB-lpcre\fR.
I have implemented only those option bits that can be reasonably mapped to PCRE
native options. In addition, the options REG_EXTENDED and REG_NOSUB are defined
with the value zero. They have no effect, but since programs that are written
to the POSIX interface often use them, this makes it easier to slot in PCRE as
a replacement library. Other POSIX options are not even defined.
When PCRE is called via these functions, it is only the API that is POSIX-like
in style. The syntax and semantics of the regular expressions themselves are
still those of Perl, subject to the setting of various PCRE options, as
described below.
The header for these functions is supplied as \fBpcreposix.h\fR to avoid any
potential clash with other POSIX libraries. It can, of course, be renamed or
aliased as \fBregex.h\fR, which is the "correct" name. It provides two
structure types, \fIregex_t\fR for compiled internal forms, and
\fIregmatch_t\fR for returning captured substrings. It also defines some
constants whose names start with "REG_"; these are used for setting options and
identifying error codes.
.SH COMPILING A PATTERN
The function \fBregcomp()\fR is called to compile a pattern into an
internal form. The pattern is a C string terminated by a binary zero, and
is passed in the argument \fIpattern\fR. The \fIpreg\fR argument is a pointer
to a regex_t structure which is used as a base for storing information about
the compiled expression.
The argument \fIcflags\fR is either zero, or contains one or more of the bits
defined by the following macros:
REG_ICASE
The PCRE_CASELESS option is set when the expression is passed for compilation
to the native function.
REG_NEWLINE
The PCRE_MULTILINE option is set when the expression is passed for compilation
to the native function.
In the absence of these flags, no options are passed to the native function.
This means that the regex is compiled with PCRE default semantics. In
particular, the way it handles newline characters in the subject string is the
Perl way, not the POSIX way. Note that setting PCRE_MULTILINE has only
\fIsome\fR of the effects specified for REG_NEWLINE. It does not affect the way
newlines are matched by . (they aren't) or a negative class such as [^a] (they
are).
The yield of \fBregcomp()\fR is zero on success, and non-zero otherwise. The
\fIpreg\fR structure is filled in on success, and one member of the structure
is publicized: \fIre_nsub\fR contains the number of capturing subpatterns in
the regular expression. Various error codes are defined in the header file.
.SH MATCHING A PATTERN
The function \fBregexec()\fR is called to match a pre-compiled pattern
\fIpreg\fR against a given \fIstring\fR, which is terminated by a zero byte,
subject to the options in \fIeflags\fR. These can be:
REG_NOTBOL
The PCRE_NOTBOL option is set when calling the underlying PCRE matching
function.
REG_NOTEOL
The PCRE_NOTEOL option is set when calling the underlying PCRE matching
function.
The portion of the string that was matched, and also any captured substrings,
are returned via the \fIpmatch\fR argument, which points to an array of
\fInmatch\fR structures of type \fIregmatch_t\fR, containing the members
\fIrm_so\fR and \fIrm_eo\fR. These contain the offset to the first character of
each substring and the offset to the first character after the end of each
substring, respectively. The 0th element of the vector relates to the entire
portion of \fIstring\fR that was matched; subsequent elements relate to the
capturing subpatterns of the regular expression. Unused entries in the array
have both structure members set to -1.
A successful match yields a zero return; various error codes are defined in the
header file, of which REG_NOMATCH is the "expected" failure code.
.SH ERROR MESSAGES
The \fBregerror()\fR function maps a non-zero errorcode from either
\fBregcomp\fR or \fBregexec\fR to a printable message. If \fIpreg\fR is not
NULL, the error should have arisen from the use of that structure. A message
terminated by a binary zero is placed in \fIerrbuf\fR. The length of the
message, including the zero, is limited to \fIerrbuf_size\fR. The yield of the
function is the size of buffer needed to hold the whole message.
.SH STORAGE
Compiling a regular expression causes memory to be allocated and associated
with the \fIpreg\fR structure. The function \fBregfree()\fR frees all such
memory, after which \fIpreg\fR may no longer be used as a compiled expression.
.SH AUTHOR
Philip Hazel
.br
University Computing Service,
.br
New Museums Site,
.br
Cambridge CB2 3QG, England.
.br
Phone: +44 1223 334714
Copyright (c) 1997-2000 University of Cambridge.
privoxy-3.0.21-stable/./pcre/doc/pcregrep.html 000640 001751 001751 00000006776 10546014100 020164 0 ustar 00fk fk 000000 000000
pcregrep specification
pcregrep specification
This HTML document has been generated automatically from the original man page.
If there is any nonsense in it, please consult the man page in case the
conversion went wrong.
pcregrep searches files for character patterns, in the same way as other
grep commands do, but it uses the PCRE regular expression library to support
patterns that are compatible with the regular expressions of Perl 5. See
pcre(3) for a full description of syntax and semantics.
If no files are specified, pcregrep reads the standard input. By default,
each line that matches the pattern is copied to the standard output, and if
there is more than one file, the file name is printed before each line of
output. However, there are options that can change how pcregrep behaves.
Lines are limited to BUFSIZ characters. BUFSIZ is defined in <stdio.h>.
The newline character is removed from the end of each line before it is matched
against the pattern.
-V
Write the version number of the PCRE library being used to the standard error
stream.
-c
Do not print individual lines; instead just print a count of the number of
lines that would otherwise have been printed. If several files are given, a
count is printed for each of them.
-h
Suppress printing of filenames when searching multiple files.
-i
Ignore upper/lower case distinctions during comparisons.
-l
Instead of printing lines from the files, just print the names of the files
containing lines that would have been printed. Each file name is printed
once, on a separate line.
-n
Precede each line by its line number in the file.
-s
Work silently, that is, display nothing except error messages.
The exit status indicates whether any matches were found.
-v
Invert the sense of the match, so that lines which do not match the
pattern are now the ones that are found.
-x
Force the pattern to be anchored (it must start matching at the beginning of
the line) and in addition, require it to match the entire line. This is
equivalent to having ^ and $ characters at the start and end of each
alternative branch in the regular expression.
Philip Hazel <ph10@cam.ac.uk>
Copyright (c) 1997-2000 University of Cambridge.
privoxy-3.0.21-stable/./pcre/doc/news 000640 001751 001751 00000004116 10546014077 016365 0 ustar 00fk fk 000000 000000 News about PCRE releases
------------------------
Release 3.3 01-Aug-00
---------------------
There is some support for UTF-8 character strings. This is incomplete and
experimental. The documentation describes what is and what is not implemented.
Otherwise, this is just a bug-fixing release.
Release 3.0 01-Feb-00
---------------------
1. A "configure" script is now used to configure PCRE for Unix systems. It
builds a Makefile, a config.h file, and the pcre-config script.
2. PCRE is built as a shared library by default.
3. There is support for POSIX classes such as [:alpha:].
4. There is an experimental recursion feature.
----------------------------------------------------------------------------
IMPORTANT FOR THOSE UPGRADING FROM VERSIONS BEFORE 2.00
Please note that there has been a change in the API such that a larger
ovector is required at matching time, to provide some additional workspace.
The new man page has details. This change was necessary in order to support
some of the new functionality in Perl 5.005.
IMPORTANT FOR THOSE UPGRADING FROM VERSION 2.00
Another (I hope this is the last!) change has been made to the API for the
pcre_compile() function. An additional argument has been added to make it
possible to pass over a pointer to character tables built in the current
locale by pcre_maketables(). To use the default tables, this new argument
should be passed as NULL.
IMPORTANT FOR THOSE UPGRADING FROM VERSION 2.05
Yet another (and again I hope this really is the last) change has been made
to the API for the pcre_exec() function. An additional argument has been
added to make it possible to start the match other than at the start of the
subject string. This is important if there are lookbehinds. The new man
page has the details, but if you just want to convert existing programs, all
you need to do is to stick in a new fifth argument to pcre_exec(), with a
value of zero. For example, change
pcre_exec(pattern, extra, subject, length, options, ovec, ovecsize)
to
pcre_exec(pattern, extra, subject, length, 0, options, ovec, ovecsize)
****
privoxy-3.0.21-stable/./pcre/doc/perltest.txt 000640 001751 001751 00000002745 10546014100 020062 0 ustar 00fk fk 000000 000000 The perltest program
--------------------
The perltest program tests Perl's regular expressions; it has the same
specification as pcretest, and so can be given identical input, except that
input patterns can be followed only by Perl's lower case modifiers and /+ (as
used by pcretest), which is recognized and handled by the program.
The data lines are processed as Perl double-quoted strings, so if they contain
" \ $ or @ characters, these have to be escaped. For this reason, all such
characters in testinput1 and testinput3 are escaped so that they can be used
for perltest as well as for pcretest, and the special upper case modifiers such
as /A that pcretest recognizes are not used in these files. The output should
be identical, apart from the initial identifying banner.
For testing UTF-8 features, an alternative form of perltest, called perltest8,
is supplied. This requires Perl 5.6 or higher. It recognizes the special
modifier /8 that pcretest uses to invoke UTF-8 functionality. The testinput5
file can be fed to perltest8.
The testinput2 and testinput4 files are not suitable for feeding to perltest,
since they do make use of the special upper case modifiers and escapes that
pcretest uses to test some features of PCRE. The first of these files also
contains malformed regular expressions, in order to check that PCRE diagnoses
them correctly. Similarly, testinput6 tests UTF-8 features that do not relate
to Perl.
Philip Hazel
August 2000
privoxy-3.0.21-stable/./pcre/doc/pcregrep.txt 000640 001751 001751 00000005646 10546014100 020032 0 ustar 00fk fk 000000 000000 NAME
pcregrep - a grep with Perl-compatible regular expressions.
SYNOPSIS
pcregrep [-Vchilnsvx] pattern [file] ...
DESCRIPTION
pcregrep searches files for character patterns, in the same
way as other grep commands do, but it uses the PCRE regular
expression library to support patterns that are compatible
with the regular expressions of Perl 5. See pcre(3) for a
full description of syntax and semantics.
If no files are specified, pcregrep reads the standard
input. By default, each line that matches the pattern is
copied to the standard output, and if there is more than one
file, the file name is printed before each line of output.
However, there are options that can change how pcregrep
behaves.
Lines are limited to BUFSIZ characters. BUFSIZ is defined in
<stdio.h>. The newline character is removed from the end of
each line before it is matched against the pattern.
OPTIONS
-V Write the version number of the PCRE library being
used to the standard error stream.
-c Do not print individual lines; instead just print
a count of the number of lines that would other-
wise have been printed. If several files are
given, a count is printed for each of them.
-h Suppress printing of filenames when searching mul-
tiple files.
-i Ignore upper/lower case distinctions during com-
parisons.
-l Instead of printing lines from the files, just
print the names of the files containing lines that
would have been printed. Each file name is printed
once, on a separate line.
-n Precede each line by its line number in the file.
-s Work silently, that is, display nothing except
error messages. The exit status indicates whether
any matches were found.
-v Invert the sense of the match, so that lines which
do not match the pattern are now the ones that are
found.
-x Force the pattern to be anchored (it must start
matching at the beginning of the line) and in
addition, require it to match the entire line.
This is equivalent to having ^ and $ characters at
the start and end of each alternative branch in
the regular expression.
SEE ALSO
pcre(3), Perl 5 documentation
DIAGNOSTICS
Exit status is 0 if any matches were found, 1 if no matches
were found, and 2 for syntax errors or inaccessible files
(even if matches were found).
AUTHOR
Philip Hazel
Copyright (c) 1997-2000 University of Cambridge.
privoxy-3.0.21-stable/./pcre/doc/pcre.3 000640 001751 001751 00000247276 10546014100 016506 0 ustar 00fk fk 000000 000000 .TH PCRE 3
.SH NAME
pcre - Perl-compatible regular expressions.
.SH SYNOPSIS
.B #include <pcre.h>
.PP
.SM
.br
.B pcre *pcre_compile(const char *\fIpattern\fR, int \fIoptions\fR,
.ti +5n
.B const char **\fIerrptr\fR, int *\fIerroffset\fR,
.ti +5n
.B const unsigned char *\fItableptr\fR);
.PP
.br
.B pcre_extra *pcre_study(const pcre *\fIcode\fR, int \fIoptions\fR,
.ti +5n
.B const char **\fIerrptr\fR);
.PP
.br
.B int pcre_exec(const pcre *\fIcode\fR, "const pcre_extra *\fIextra\fR,"
.ti +5n
.B "const char *\fIsubject\fR," int \fIlength\fR, int \fIstartoffset\fR,
.ti +5n
.B int \fIoptions\fR, int *\fIovector\fR, int \fIovecsize\fR);
.PP
.br
.B int pcre_copy_substring(const char *\fIsubject\fR, int *\fIovector\fR,
.ti +5n
.B int \fIstringcount\fR, int \fIstringnumber\fR, char *\fIbuffer\fR,
.ti +5n
.B int \fIbuffersize\fR);
.PP
.br
.B int pcre_get_substring(const char *\fIsubject\fR, int *\fIovector\fR,
.ti +5n
.B int \fIstringcount\fR, int \fIstringnumber\fR,
.ti +5n
.B const char **\fIstringptr\fR);
.PP
.br
.B int pcre_get_substring_list(const char *\fIsubject\fR,
.ti +5n
.B int *\fIovector\fR, int \fIstringcount\fR, "const char ***\fIlistptr\fR);"
.PP
.br
.B void pcre_free_substring(const char *\fIstringptr\fR);
.PP
.br
.B void pcre_free_substring_list(const char **\fIstringptr\fR);
.PP
.br
.B const unsigned char *pcre_maketables(void);
.PP
.br
.B int pcre_fullinfo(const pcre *\fIcode\fR, "const pcre_extra *\fIextra\fR,"
.ti +5n
.B int \fIwhat\fR, void *\fIwhere\fR);
.PP
.br
.B int pcre_info(const pcre *\fIcode\fR, int *\fIoptptr\fR, int
.B *\fIfirstcharptr\fR);
.PP
.br
.B char *pcre_version(void);
.PP
.br
.B void *(*pcre_malloc)(size_t);
.PP
.br
.B void (*pcre_free)(void *);
.SH DESCRIPTION
The PCRE library is a set of functions that implement regular expression
pattern matching using the same syntax and semantics as Perl 5, with just a few
differences (see below). The current implementation corresponds to Perl 5.005,
with some additional features from later versions. This includes some
experimental, incomplete support for UTF-8 encoded strings. Details of exactly
what is and what is not supported are given below.
PCRE has its own native API, which is described in this document. There is also
a set of wrapper functions that correspond to the POSIX regular expression API.
These are described in the \fBpcreposix\fR documentation.
The native API function prototypes are defined in the header file \fBpcre.h\fR,
and on Unix systems the library itself is called \fBlibpcre.a\fR, so can be
accessed by adding \fB-lpcre\fR to the command for linking an application which
calls it. The header file defines the macros PCRE_MAJOR and PCRE_MINOR to
contain the major and minor release numbers for the library. Applications can
use these to include support for different releases.
The functions \fBpcre_compile()\fR, \fBpcre_study()\fR, and \fBpcre_exec()\fR
are used for compiling and matching regular expressions.
The functions \fBpcre_copy_substring()\fR, \fBpcre_get_substring()\fR, and
\fBpcre_get_substring_list()\fR are convenience functions for extracting
captured substrings from a matched subject string; \fBpcre_free_substring()\fR
and \fBpcre_free_substring_list()\fR are also provided, to free the memory used
for extracted strings.
The function \fBpcre_maketables()\fR is used (optionally) to build a set of
character tables in the current locale for passing to \fBpcre_compile()\fR.
The function \fBpcre_fullinfo()\fR is used to find out information about a
compiled pattern; \fBpcre_info()\fR is an obsolete version which returns only
some of the available information, but is retained for backwards compatibility.
The function \fBpcre_version()\fR returns a pointer to a string containing the
version of PCRE and its date of release.
The global variables \fBpcre_malloc\fR and \fBpcre_free\fR initially contain
the entry points of the standard \fBmalloc()\fR and \fBfree()\fR functions
respectively. PCRE calls the memory management functions via these variables,
so a calling program can replace them if it wishes to intercept the calls. This
should be done before calling any PCRE functions.
.SH MULTI-THREADING
The PCRE functions can be used in multi-threading applications, with the
proviso that the memory management functions pointed to by \fBpcre_malloc\fR
and \fBpcre_free\fR are shared by all threads.
The compiled form of a regular expression is not altered during matching, so
the same compiled pattern can safely be used by several threads at once.
.SH COMPILING A PATTERN
The function \fBpcre_compile()\fR is called to compile a pattern into an
internal form. The pattern is a C string terminated by a binary zero, and
is passed in the argument \fIpattern\fR. A pointer to a single block of memory
that is obtained via \fBpcre_malloc\fR is returned. This contains the
compiled code and related data. The \fBpcre\fR type is defined for this for
convenience, but in fact \fBpcre\fR is just a typedef for \fBvoid\fR, since the
contents of the block are not externally defined. It is up to the caller to
free the memory when it is no longer required.
.PP
The size of a compiled pattern is roughly proportional to the length of the
pattern string, except that each character class (other than those containing
just a single character, negated or not) requires 33 bytes, and repeat
quantifiers with a minimum greater than one or a bounded maximum cause the
relevant portions of the compiled pattern to be replicated.
.PP
The \fIoptions\fR argument contains independent bits that affect the
compilation. It should be zero if no options are required. Some of the options,
in particular, those that are compatible with Perl, can also be set and unset
from within the pattern (see the detailed description of regular expressions
below). For these options, the contents of the \fIoptions\fR argument specify
their initial settings at the start of compilation and execution. The
PCRE_ANCHORED option can be set at the time of matching as well as at compile
time.
.PP
If \fIerrptr\fR is NULL, \fBpcre_compile()\fR returns NULL immediately.
Otherwise, if compilation of a pattern fails, \fBpcre_compile()\fR returns
NULL, and sets the variable pointed to by \fIerrptr\fR to point to a textual
error message. The offset from the start of the pattern to the character where
the error was discovered is placed in the variable pointed to by
\fIerroffset\fR, which must not be NULL. If it is, an immediate error is given.
.PP
If the final argument, \fItableptr\fR, is NULL, PCRE uses a default set of
character tables which are built when it is compiled, using the default C
locale. Otherwise, \fItableptr\fR must be the result of a call to
\fBpcre_maketables()\fR. See the section on locale support below.
.PP
The following option bits are defined in the header file:
PCRE_ANCHORED
If this bit is set, the pattern is forced to be "anchored", that is, it is
constrained to match only at the start of the string which is being searched
(the "subject string"). This effect can also be achieved by appropriate
constructs in the pattern itself, which is the only way to do it in Perl.
PCRE_CASELESS
If this bit is set, letters in the pattern match both upper and lower case
letters. It is equivalent to Perl's /i option.
PCRE_DOLLAR_ENDONLY
If this bit is set, a dollar metacharacter in the pattern matches only at the
end of the subject string. Without this option, a dollar also matches
immediately before the final character if it is a newline (but not before any
other newlines). The PCRE_DOLLAR_ENDONLY option is ignored if PCRE_MULTILINE is
set. There is no equivalent to this option in Perl.
PCRE_DOTALL
If this bit is set, a dot metacharacter in the pattern matches all characters,
including newlines. Without it, newlines are excluded. This option is
equivalent to Perl's /s option. A negative class such as [^a] always matches a
newline character, independent of the setting of this option.
PCRE_EXTENDED
If this bit is set, whitespace data characters in the pattern are totally
ignored except when escaped or inside a character class, and characters between
an unescaped # outside a character class and the next newline character,
inclusive, are also ignored. This is equivalent to Perl's /x option, and makes
it possible to include comments inside complicated patterns. Note, however,
that this applies only to data characters. Whitespace characters may never
appear within special character sequences in a pattern, for example within the
sequence (?( which introduces a conditional subpattern.
PCRE_EXTRA
This option was invented in order to turn on additional functionality of PCRE
that is incompatible with Perl, but it is currently of very little use. When
set, any backslash in a pattern that is followed by a letter that has no
special meaning causes an error, thus reserving these combinations for future
expansion. By default, as in Perl, a backslash followed by a letter with no
special meaning is treated as a literal. There are at present no other features
controlled by this option. It can also be set by a (?X) option setting within a
pattern.
PCRE_MULTILINE
By default, PCRE treats the subject string as consisting of a single "line" of
characters (even if it actually contains several newlines). The "start of line"
metacharacter (^) matches only at the start of the string, while the "end of
line" metacharacter ($) matches only at the end of the string, or before a
terminating newline (unless PCRE_DOLLAR_ENDONLY is set). This is the same as
Perl.
When PCRE_MULTILINE is set, the "start of line" and "end of line" constructs
match immediately following or immediately before any newline in the subject
string, respectively, as well as at the very start and end. This is equivalent
to Perl's /m option. If there are no "\\n" characters in a subject string, or
no occurrences of ^ or $ in a pattern, setting PCRE_MULTILINE has no
effect.
PCRE_UNGREEDY
This option inverts the "greediness" of the quantifiers so that they are not
greedy by default, but become greedy if followed by "?". It is not compatible
with Perl. It can also be set by a (?U) option setting within the pattern.
PCRE_UTF8
This option causes PCRE to regard both the pattern and the subject as strings
of UTF-8 characters instead of just byte strings. However, it is available only
if PCRE has been built to include UTF-8 support. If not, the use of this option
provokes an error. Support for UTF-8 is new, experimental, and incomplete.
Details of exactly what it entails are given below.
.SH STUDYING A PATTERN
When a pattern is going to be used several times, it is worth spending more
time analyzing it in order to speed up the time taken for matching. The
function \fBpcre_study()\fR takes a pointer to a compiled pattern as its first
argument, and returns a pointer to a \fBpcre_extra\fR block (another \fBvoid\fR
typedef) containing additional information about the pattern; this can be
passed to \fBpcre_exec()\fR. If no additional information is available, NULL
is returned.
The second argument contains option bits. At present, no options are defined
for \fBpcre_study()\fR, and this argument should always be zero.
The third argument for \fBpcre_study()\fR is a pointer to an error message. If
studying succeeds (even if no data is returned), the variable it points to is
set to NULL. Otherwise it points to a textual error message.
At present, studying a pattern is useful only for non-anchored patterns that do
not have a single fixed starting character. A bitmap of possible starting
characters is created.
.SH LOCALE SUPPORT
PCRE handles caseless matching, and determines whether characters are letters,
digits, or whatever, by reference to a set of tables. The library contains a
default set of tables which is created in the default C locale when PCRE is
compiled. This is used when the final argument of \fBpcre_compile()\fR is NULL,
and is sufficient for many applications.
An alternative set of tables can, however, be supplied. Such tables are built
by calling the \fBpcre_maketables()\fR function, which has no arguments, in the
relevant locale. The result can then be passed to \fBpcre_compile()\fR as often
as necessary. For example, to build and use tables that are appropriate for the
French locale (where accented characters with codes greater than 128 are
treated as letters), the following code could be used:
setlocale(LC_CTYPE, "fr");
tables = pcre_maketables();
re = pcre_compile(..., tables);
The tables are built in memory that is obtained via \fBpcre_malloc\fR. The
pointer that is passed to \fBpcre_compile\fR is saved with the compiled
pattern, and the same tables are used via this pointer by \fBpcre_study()\fR
and \fBpcre_exec()\fR. Thus for any single pattern, compilation, studying and
matching all happen in the same locale, but different patterns can be compiled
in different locales. It is the caller's responsibility to ensure that the
memory containing the tables remains available for as long as it is needed.
.SH INFORMATION ABOUT A PATTERN
The \fBpcre_fullinfo()\fR function returns information about a compiled
pattern. It replaces the obsolete \fBpcre_info()\fR function, which is
nevertheless retained for backwards compatibility (and is documented below).
The first argument for \fBpcre_fullinfo()\fR is a pointer to the compiled
pattern. The second argument is the result of \fBpcre_study()\fR, or NULL if
the pattern was not studied. The third argument specifies which piece of
information is required, while the fourth argument is a pointer to a variable
to receive the data. The yield of the function is zero for success, or one of
the following negative numbers:
PCRE_ERROR_NULL the argument \fIcode\fR was NULL
the argument \fIwhere\fR was NULL
PCRE_ERROR_BADMAGIC the "magic number" was not found
PCRE_ERROR_BADOPTION the value of \fIwhat\fR was invalid
The possible values for the third argument are defined in \fBpcre.h\fR, and are
as follows:
PCRE_INFO_OPTIONS
Return a copy of the options with which the pattern was compiled. The fourth
argument should point to an \fBunsigned long int\fR variable. These option bits
are those specified in the call to \fBpcre_compile()\fR, modified by any
top-level option settings within the pattern itself, and with the PCRE_ANCHORED
bit forcibly set if the form of the pattern implies that it can match only at
the start of a subject string.
PCRE_INFO_SIZE
Return the size of the compiled pattern, that is, the value that was passed as
the argument to \fBpcre_malloc()\fR when PCRE was getting memory in which to
place the compiled data. The fourth argument should point to a \fBsize_t\fR
variable.
PCRE_INFO_CAPTURECOUNT
Return the number of capturing subpatterns in the pattern. The fourth argument
should point to an \fBint\fR variable.
PCRE_INFO_BACKREFMAX
Return the number of the highest back reference in the pattern. The fourth
argument should point to an \fBint\fR variable. Zero is returned if there are
no back references.
PCRE_INFO_FIRSTCHAR
Return information about the first character of any matched string, for a
non-anchored pattern. If there is a fixed first character, e.g. from a pattern
such as (cat|cow|coyote), it is returned in the integer pointed to by
\fIwhere\fR. Otherwise, if either
(a) the pattern was compiled with the PCRE_MULTILINE option, and every branch
starts with "^", or
(b) every branch of the pattern starts with ".*" and PCRE_DOTALL is not set
(if it were set, the pattern would be anchored),
-1 is returned, indicating that the pattern matches only at the start of a
subject string or after any "\\n" within the string. Otherwise -2 is returned.
For anchored patterns, -2 is returned.
PCRE_INFO_FIRSTTABLE
If the pattern was studied, and this resulted in the construction of a 256-bit
table indicating a fixed set of characters for the first character in any
matching string, a pointer to the table is returned. Otherwise NULL is
returned. The fourth argument should point to an \fBunsigned char *\fR
variable.
PCRE_INFO_LASTLITERAL
For a non-anchored pattern, return the value of the rightmost literal character
which must exist in any matched string, other than at its start. The fourth
argument should point to an \fBint\fR variable. If there is no such character,
or if the pattern is anchored, -1 is returned. For example, for the pattern
/a\\d+z\\d+/ the returned value is 'z'.
The \fBpcre_info()\fR function is now obsolete because its interface is too
restrictive to return all the available data about a compiled pattern. New
programs should use \fBpcre_fullinfo()\fR instead. The yield of
\fBpcre_info()\fR is the number of capturing subpatterns, or one of the
following negative numbers:
PCRE_ERROR_NULL the argument \fIcode\fR was NULL
PCRE_ERROR_BADMAGIC the "magic number" was not found
If the \fIoptptr\fR argument is not NULL, a copy of the options with which the
pattern was compiled is placed in the integer it points to (see
PCRE_INFO_OPTIONS above).
If the pattern is not anchored and the \fIfirstcharptr\fR argument is not NULL,
it is used to pass back information about the first character of any matched
string (see PCRE_INFO_FIRSTCHAR above).
.SH MATCHING A PATTERN
The function \fBpcre_exec()\fR is called to match a subject string against a
pre-compiled pattern, which is passed in the \fIcode\fR argument. If the
pattern has been studied, the result of the study should be passed in the
\fIextra\fR argument. Otherwise this must be NULL.
The PCRE_ANCHORED option can be passed in the \fIoptions\fR argument, whose
unused bits must be zero. However, if a pattern was compiled with
PCRE_ANCHORED, or turned out to be anchored by virtue of its contents, it
cannot be made unanchored at matching time.
There are also three further options that can be set only at matching time:
PCRE_NOTBOL
The first character of the string is not the beginning of a line, so the
circumflex metacharacter should not match before it. Setting this without
PCRE_MULTILINE (at compile time) causes circumflex never to match.
PCRE_NOTEOL
The end of the string is not the end of a line, so the dollar metacharacter
should not match it nor (except in multiline mode) a newline immediately before
it. Setting this without PCRE_MULTILINE (at compile time) causes dollar never
to match.
PCRE_NOTEMPTY
An empty string is not considered to be a valid match if this option is set. If
there are alternatives in the pattern, they are tried. If all the alternatives
match the empty string, the entire match fails. For example, if the pattern
a?b?
is applied to a string not beginning with "a" or "b", it matches the empty
string at the start of the subject. With PCRE_NOTEMPTY set, this match is not
valid, so PCRE searches further into the string for occurrences of "a" or "b".
Perl has no direct equivalent of PCRE_NOTEMPTY, but it does make a special case
of a pattern match of the empty string within its \fBsplit()\fR function, and
when using the /g modifier. It is possible to emulate Perl's behaviour after
matching a null string by first trying the match again at the same offset with
PCRE_NOTEMPTY set, and then if that fails by advancing the starting offset (see
below) and trying an ordinary match again.
The subject string is passed as a pointer in \fIsubject\fR, a length in
\fIlength\fR, and a starting offset in \fIstartoffset\fR. Unlike the pattern
string, it may contain binary zero characters. When the starting offset is
zero, the search for a match starts at the beginning of the subject, and this
is by far the most common case.
A non-zero starting offset is useful when searching for another match in the
same subject by calling \fBpcre_exec()\fR again after a previous success.
Setting \fIstartoffset\fR differs from just passing over a shortened string and
setting PCRE_NOTBOL in the case of a pattern that begins with any kind of
lookbehind. For example, consider the pattern
\\Biss\\B
which finds occurrences of "iss" in the middle of words. (\\B matches only if
the current position in the subject is not a word boundary.) When applied to
the string "Mississippi" the first call to \fBpcre_exec()\fR finds the first
occurrence. If \fBpcre_exec()\fR is called again with just the remainder of the
subject, namely "issippi", it does not match, because \\B is always false at the
start of the subject, which is deemed to be a word boundary. However, if
\fBpcre_exec()\fR is passed the entire string again, but with \fIstartoffset\fR
set to 4, it finds the second occurrence of "iss" because it is able to look
behind the starting point to discover that it is preceded by a letter.
If a non-zero starting offset is passed when the pattern is anchored, one
attempt to match at the given offset is tried. This can only succeed if the
pattern does not require the match to be at the start of the subject.
In general, a pattern matches a certain portion of the subject, and in
addition, further substrings from the subject may be picked out by parts of the
pattern. Following the usage in Jeffrey Friedl's book, this is called
"capturing" in what follows, and the phrase "capturing subpattern" is used for
a fragment of a pattern that picks out a substring. PCRE supports several other
kinds of parenthesized subpattern that do not cause substrings to be captured.
Captured substrings are returned to the caller via a vector of integer offsets
whose address is passed in \fIovector\fR. The number of elements in the vector
is passed in \fIovecsize\fR. The first two-thirds of the vector is used to pass
back captured substrings, each substring using a pair of integers. The
remaining third of the vector is used as workspace by \fBpcre_exec()\fR while
matching capturing subpatterns, and is not available for passing back
information. The length passed in \fIovecsize\fR should always be a multiple of
three. If it is not, it is rounded down.
When a match has been successful, information about captured substrings is
returned in pairs of integers, starting at the beginning of \fIovector\fR, and
continuing up to two-thirds of its length at the most. The first element of a
pair is set to the offset of the first character in a substring, and the second
is set to the offset of the first character after the end of a substring. The
first pair, \fIovector[0]\fR and \fIovector[1]\fR, identify the portion of the
subject string matched by the entire pattern. The next pair is used for the
first capturing subpattern, and so on. The value returned by \fBpcre_exec()\fR
is the number of pairs that have been set. If there are no capturing
subpatterns, the return value from a successful match is 1, indicating that
just the first pair of offsets has been set.
Some convenience functions are provided for extracting the captured substrings
as separate strings. These are described in the following section.
It is possible for a capturing subpattern number \fIn+1\fR to match some
part of the subject when subpattern \fIn\fR has not been used at all. For
example, if the string "abc" is matched against the pattern (a|(z))(bc)
subpatterns 1 and 3 are matched, but 2 is not. When this happens, both offset
values corresponding to the unused subpattern are set to -1.
If a capturing subpattern is matched repeatedly, it is the last portion of the
string that it matched that gets returned.
If the vector is too small to hold all the captured substrings, it is used as
far as possible (up to two-thirds of its length), and the function returns a
value of zero. In particular, if the substring offsets are not of interest,
\fBpcre_exec()\fR may be called with \fIovector\fR passed as NULL and
\fIovecsize\fR as zero. However, if the pattern contains back references and
the \fIovector\fR isn't big enough to remember the related substrings, PCRE has
to get additional memory for use during matching. Thus it is usually advisable
to supply an \fIovector\fR.
Note that \fBpcre_info()\fR can be used to find out how many capturing
subpatterns there are in a compiled pattern. The smallest size for
\fIovector\fR that will allow for \fIn\fR captured substrings in addition to
the offsets of the substring matched by the whole pattern is (\fIn\fR+1)*3.
If \fBpcre_exec()\fR fails, it returns a negative number. The following are
defined in the header file:
PCRE_ERROR_NOMATCH (-1)
The subject string did not match the pattern.
PCRE_ERROR_NULL (-2)
Either \fIcode\fR or \fIsubject\fR was passed as NULL, or \fIovector\fR was
NULL and \fIovecsize\fR was not zero.
PCRE_ERROR_BADOPTION (-3)
An unrecognized bit was set in the \fIoptions\fR argument.
PCRE_ERROR_BADMAGIC (-4)
PCRE stores a 4-byte "magic number" at the start of the compiled code, to catch
the case when it is passed a junk pointer. This is the error it gives when the
magic number isn't present.
PCRE_ERROR_UNKNOWN_NODE (-5)
While running the pattern match, an unknown item was encountered in the
compiled pattern. This error could be caused by a bug in PCRE or by overwriting
of the compiled pattern.
PCRE_ERROR_NOMEMORY (-6)
If a pattern contains back references, but the \fIovector\fR that is passed to
\fBpcre_exec()\fR is not big enough to remember the referenced substrings, PCRE
gets a block of memory at the start of matching to use for this purpose. If the
call via \fBpcre_malloc()\fR fails, this error is given. The memory is freed at
the end of matching.
.SH EXTRACTING CAPTURED SUBSTRINGS
Captured substrings can be accessed directly by using the offsets returned by
\fBpcre_exec()\fR in \fIovector\fR. For convenience, the functions
\fBpcre_copy_substring()\fR, \fBpcre_get_substring()\fR, and
\fBpcre_get_substring_list()\fR are provided for extracting captured substrings
as new, separate, zero-terminated strings. A substring that contains a binary
zero is correctly extracted and has a further zero added on the end, but the
result does not, of course, function as a C string.
The first three arguments are the same for all three functions: \fIsubject\fR
is the subject string which has just been successfully matched, \fIovector\fR
is a pointer to the vector of integer offsets that was passed to
\fBpcre_exec()\fR, and \fIstringcount\fR is the number of substrings that
were captured by the match, including the substring that matched the entire
regular expression. This is the value returned by \fBpcre_exec\fR if it
is greater than zero. If \fBpcre_exec()\fR returned zero, indicating that it
ran out of space in \fIovector\fR, the value passed as \fIstringcount\fR should
be the size of the vector divided by three.
The functions \fBpcre_copy_substring()\fR and \fBpcre_get_substring()\fR
extract a single substring, whose number is given as \fIstringnumber\fR. A
value of zero extracts the substring that matched the entire pattern, while
higher values extract the captured substrings. For \fBpcre_copy_substring()\fR,
the string is placed in \fIbuffer\fR, whose length is given by
\fIbuffersize\fR, while for \fBpcre_get_substring()\fR a new block of memory is
obtained via \fBpcre_malloc\fR, and its address is returned via
\fIstringptr\fR. The yield of the function is the length of the string, not
including the terminating zero, or one of
PCRE_ERROR_NOMEMORY (-6)
The buffer was too small for \fBpcre_copy_substring()\fR, or the attempt to get
memory failed for \fBpcre_get_substring()\fR.
PCRE_ERROR_NOSUBSTRING (-7)
There is no substring whose number is \fIstringnumber\fR.
The \fBpcre_get_substring_list()\fR function extracts all available substrings
and builds a list of pointers to them. All this is done in a single block of
memory which is obtained via \fBpcre_malloc\fR. The address of the memory block
is returned via \fIlistptr\fR, which is also the start of the list of string
pointers. The end of the list is marked by a NULL pointer. The yield of the
function is zero if all went well, or
PCRE_ERROR_NOMEMORY (-6)
if the attempt to get the memory block failed.
When any of these functions encounter a substring that is unset, which can
happen when capturing subpattern number \fIn+1\fR matches some part of the
subject, but subpattern \fIn\fR has not been used at all, they return an empty
string. This can be distinguished from a genuine zero-length substring by
inspecting the appropriate offset in \fIovector\fR, which is negative for unset
substrings.
The two convenience functions \fBpcre_free_substring()\fR and
\fBpcre_free_substring_list()\fR can be used to free the memory returned by
a previous call of \fBpcre_get_substring()\fR or
\fBpcre_get_substring_list()\fR, respectively. They do nothing more than call
the function pointed to by \fBpcre_free\fR, which of course could be called
directly from a C program. However, PCRE is used in some situations where it is
linked via a special interface to another programming language which cannot use
\fBpcre_free\fR directly; it is for these cases that the functions are
provided.
.SH LIMITATIONS
There are some size limitations in PCRE but it is hoped that they will never in
practice be relevant.
The maximum length of a compiled pattern is 65539 (sic) bytes.
All values in repeating quantifiers must be less than 65536.
The maximum number of capturing subpatterns is 99.
The maximum number of all parenthesized subpatterns, including capturing
subpatterns, assertions, and other types of subpattern, is 200.
The maximum length of a subject string is the largest positive number that an
integer variable can hold. However, PCRE uses recursion to handle subpatterns
and indefinite repetition. This means that the available stack space may limit
the size of a subject string that can be processed by certain patterns.
.SH DIFFERENCES FROM PERL
The differences described here are with respect to Perl 5.005.
1. By default, a whitespace character is any character that the C library
function \fBisspace()\fR recognizes, though it is possible to compile PCRE with
alternative character type tables. Normally \fBisspace()\fR matches space,
formfeed, newline, carriage return, horizontal tab, and vertical tab. Perl 5
no longer includes vertical tab in its set of whitespace characters. The \\v
escape that was in the Perl documentation for a long time was never in fact
recognized. However, the character itself was treated as whitespace at least
up to 5.002. In 5.004 and 5.005 it does not match \\s.
2. PCRE does not allow repeat quantifiers on lookahead assertions. Perl permits
them, but they do not mean what you might think. For example, (?!a){3} does
not assert that the next three characters are not "a". It just asserts that the
next character is not "a" three times.
3. Capturing subpatterns that occur inside negative lookahead assertions are
counted, but their entries in the offsets vector are never set. Perl sets its
numerical variables from any such patterns that are matched before the
assertion fails to match something (thereby succeeding), but only if the
negative lookahead assertion contains just one branch.
4. Though binary zero characters are supported in the subject string, they are
not allowed in a pattern string because it is passed as a normal C string,
terminated by zero. The escape sequence "\\0" can be used in the pattern to
represent a binary zero.
5. The following Perl escape sequences are not supported: \\l, \\u, \\L, \\U,
\\E, \\Q. In fact these are implemented by Perl's general string-handling and
are not part of its pattern matching engine.
6. The Perl \\G assertion is not supported as it is not relevant to single
pattern matches.
7. Fairly obviously, PCRE does not support the (?{code}) and (?p{code})
constructions. However, there is some experimental support for recursive
patterns using the non-Perl item (?R).
8. There are at the time of writing some oddities in Perl 5.005_02 concerned
with the settings of captured strings when part of a pattern is repeated. For
example, matching "aba" against the pattern /^(a(b)?)+$/ sets $2 to the value
"b", but matching "aabbaa" against /^(aa(bb)?)+$/ leaves $2 unset. However, if
the pattern is changed to /^(aa(b(b))?)+$/ then $2 (and $3) are set.
In Perl 5.004 $2 is set in both cases, and that is also true of PCRE. If in the
future Perl changes to a consistent state that is different, PCRE may change to
follow.
9. Another as yet unresolved discrepancy is that in Perl 5.005_02 the pattern
/^(a)?(?(1)a|b)+$/ matches the string "a", whereas in PCRE it does not.
However, in both Perl and PCRE /^(a)?a/ matched against "a" leaves $1 unset.
10. PCRE provides some extensions to the Perl regular expression facilities:
(a) Although lookbehind assertions must match fixed length strings, each
alternative branch of a lookbehind assertion can match a different length of
string. Perl 5.005 requires them all to have the same length.
(b) If PCRE_DOLLAR_ENDONLY is set and PCRE_MULTILINE is not set, the $ meta-
character matches only at the very end of the string.
(c) If PCRE_EXTRA is set, a backslash followed by a letter with no special
meaning is faulted.
(d) If PCRE_UNGREEDY is set, the greediness of the repetition quantifiers is
inverted, that is, by default they are not greedy, but if followed by a
question mark they are.
(e) PCRE_ANCHORED can be used to force a pattern to be tried only at the start
of the subject.
(f) The PCRE_NOTBOL, PCRE_NOTEOL, and PCRE_NOTEMPTY options for
\fBpcre_exec()\fR have no Perl equivalents.
(g) The (?R) construct allows for recursive pattern matching (Perl 5.6 can do
this using the (?p{code}) construct, which PCRE cannot of course support.)
.SH REGULAR EXPRESSION DETAILS
The syntax and semantics of the regular expressions supported by PCRE are
described below. Regular expressions are also described in the Perl
documentation and in a number of other books, some of which have copious
examples. Jeffrey Friedl's "Mastering Regular Expressions", published by
O'Reilly (ISBN 1-56592-257-3), covers them in great detail.
The description here is intended as reference documentation. The basic
operation of PCRE is on strings of bytes. However, there are the beginnings of
some support for UTF-8 character strings. To use this support you must
configure PCRE to include it, and then call \fBpcre_compile()\fR with the
PCRE_UTF8 option. How this affects the pattern matching is described in the
final section of this document.
A regular expression is a pattern that is matched against a subject string from
left to right. Most characters stand for themselves in a pattern, and match the
corresponding characters in the subject. As a trivial example, the pattern
The quick brown fox
matches a portion of a subject string that is identical to itself. The power of
regular expressions comes from the ability to include alternatives and
repetitions in the pattern. These are encoded in the pattern by the use of
\fImeta-characters\fR, which do not stand for themselves but instead are
interpreted in some special way.
There are two different sets of meta-characters: those that are recognized
anywhere in the pattern except within square brackets, and those that are
recognized in square brackets. Outside square brackets, the meta-characters are
as follows:
\\ general escape character with several uses
^ assert start of subject (or line, in multiline mode)
$ assert end of subject (or line, in multiline mode)
. match any character except newline (by default)
[ start character class definition
| start of alternative branch
( start subpattern
) end subpattern
? extends the meaning of (
also 0 or 1 quantifier
also quantifier minimizer
* 0 or more quantifier
+ 1 or more quantifier
{ start min/max quantifier
Part of a pattern that is in square brackets is called a "character class". In
a character class the only meta-characters are:
\\ general escape character
^ negate the class, but only if the first character
- indicates character range
] terminates the character class
The following sections describe the use of each of the meta-characters.
.SH BACKSLASH
The backslash character has several uses. Firstly, if it is followed by a
non-alphameric character, it takes away any special meaning that character may
have. This use of backslash as an escape character applies both inside and
outside character classes.
For example, if you want to match a "*" character, you write "\\*" in the
pattern. This applies whether or not the following character would otherwise be
interpreted as a meta-character, so it is always safe to precede a
non-alphameric with "\\" to specify that it stands for itself. In particular,
if you want to match a backslash, you write "\\\\".
If a pattern is compiled with the PCRE_EXTENDED option, whitespace in the
pattern (other than in a character class) and characters between a "#" outside
a character class and the next newline character are ignored. An escaping
backslash can be used to include a whitespace or "#" character as part of the
pattern.
A second use of backslash provides a way of encoding non-printing characters
in patterns in a visible manner. There is no restriction on the appearance of
non-printing characters, apart from the binary zero that terminates a pattern,
but when a pattern is being prepared by text editing, it is usually easier to
use one of the following escape sequences than the binary character it
represents:
\\a alarm, that is, the BEL character (hex 07)
\\cx "control-x", where x is any character
\\e escape (hex 1B)
\\f formfeed (hex 0C)
\\n newline (hex 0A)
\\r carriage return (hex 0D)
\\t tab (hex 09)
\\xhh character with hex code hh
\\ddd character with octal code ddd, or backreference
The precise effect of "\\cx" is as follows: if "x" is a lower case letter, it
is converted to upper case. Then bit 6 of the character (hex 40) is inverted.
Thus "\\cz" becomes hex 1A, but "\\c{" becomes hex 3B, while "\\c;" becomes hex
7B.
After "\\x", up to two hexadecimal digits are read (letters can be in upper or
lower case).
After "\\0" up to two further octal digits are read. In both cases, if there
are fewer than two digits, just those that are present are used. Thus the
sequence "\\0\\x\\07" specifies two binary zeros followed by a BEL character.
Make sure you supply two digits after the initial zero if the character that
follows is itself an octal digit.
The handling of a backslash followed by a digit other than 0 is complicated.
Outside a character class, PCRE reads it and any following digits as a decimal
number. If the number is less than 10, or if there have been at least that many
previous capturing left parentheses in the expression, the entire sequence is
taken as a \fIback reference\fR. A description of how this works is given
later, following the discussion of parenthesized subpatterns.
Inside a character class, or if the decimal number is greater than 9 and there
have not been that many capturing subpatterns, PCRE re-reads up to three octal
digits following the backslash, and generates a single byte from the least
significant 8 bits of the value. Any subsequent digits stand for themselves.
For example:
\\040 is another way of writing a space
\\40 is the same, provided there are fewer than 40
previous capturing subpatterns
\\7 is always a back reference
\\11 might be a back reference, or another way of
writing a tab
\\011 is always a tab
\\0113 is a tab followed by the character "3"
\\113 is the character with octal code 113 (since there
can be no more than 99 back references)
\\377 is a byte consisting entirely of 1 bits
\\81 is either a back reference, or a binary zero
followed by the two characters "8" and "1"
Note that octal values of 100 or greater must not be introduced by a leading
zero, because no more than three octal digits are ever read.
All the sequences that define a single byte value can be used both inside and
outside character classes. In addition, inside a character class, the sequence
"\\b" is interpreted as the backspace character (hex 08). Outside a character
class it has a different meaning (see below).
The third use of backslash is for specifying generic character types:
\\d any decimal digit
\\D any character that is not a decimal digit
\\s any whitespace character
\\S any character that is not a whitespace character
\\w any "word" character
\\W any "non-word" character
Each pair of escape sequences partitions the complete set of characters into
two disjoint sets. Any given character matches one, and only one, of each pair.
A "word" character is any letter or digit or the underscore character, that is,
any character which can be part of a Perl "word". The definition of letters and
digits is controlled by PCRE's character tables, and may vary if
locale-specific matching is taking place (see "Locale support" above). For example, in
the "fr" (French) locale, some character codes greater than 128 are used for
accented letters, and these are matched by \\w.
These character type sequences can appear both inside and outside character
classes. They each match one character of the appropriate type. If the current
matching point is at the end of the subject string, all of them fail, since
there is no character to match.
The fourth use of backslash is for certain simple assertions. An assertion
specifies a condition that has to be met at a particular point in a match,
without consuming any characters from the subject string. The use of
subpatterns for more complicated assertions is described below. The backslashed
assertions are
\\b word boundary
\\B not a word boundary
\\A start of subject (independent of multiline mode)
\\Z end of subject or newline at end (independent of multiline mode)
\\z end of subject (independent of multiline mode)
These assertions may not appear in character classes (but note that "\\b" has a
different meaning, namely the backspace character, inside a character class).
A word boundary is a position in the subject string where the current character
and the previous character do not both match \\w or \\W (i.e. one matches
\\w and the other matches \\W), or the start or end of the string if the
first or last character matches \\w, respectively.
The \\A, \\Z, and \\z assertions differ from the traditional circumflex and
dollar (described below) in that they only ever match at the very start and end
of the subject string, whatever options are set. They are not affected by the
PCRE_NOTBOL or PCRE_NOTEOL options. If the \fIstartoffset\fR argument of
\fBpcre_exec()\fR is non-zero, \\A can never match. The difference between \\Z
and \\z is that \\Z matches before a newline that is the last character of the
string as well as at the end of the string, whereas \\z matches only at the
end.
.SH CIRCUMFLEX AND DOLLAR
Outside a character class, in the default matching mode, the circumflex
character is an assertion which is true only if the current matching point is
at the start of the subject string. If the \fIstartoffset\fR argument of
\fBpcre_exec()\fR is non-zero, circumflex can never match. Inside a character
class, circumflex has an entirely different meaning (see below).
Circumflex need not be the first character of the pattern if a number of
alternatives are involved, but it should be the first thing in each alternative
in which it appears if the pattern is ever to match that branch. If all
possible alternatives start with a circumflex, that is, if the pattern is
constrained to match only at the start of the subject, it is said to be an
"anchored" pattern. (There are also other constructs that can cause a pattern
to be anchored.)
A dollar character is an assertion which is true only if the current matching
point is at the end of the subject string, or immediately before a newline
character that is the last character in the string (by default). Dollar need
not be the last character of the pattern if a number of alternatives are
involved, but it should be the last item in any branch in which it appears.
Dollar has no special meaning in a character class.
The meaning of dollar can be changed so that it matches only at the very end of
the string, by setting the PCRE_DOLLAR_ENDONLY option at compile or matching
time. This does not affect the \\Z assertion.
The meanings of the circumflex and dollar characters are changed if the
PCRE_MULTILINE option is set. When this is the case, they match immediately
after and immediately before an internal "\\n" character, respectively, in
addition to matching at the start and end of the subject string. For example,
the pattern /^abc$/ matches the subject string "def\\nabc" in multiline mode,
but not otherwise. Consequently, patterns that are anchored in single line mode
because all branches start with "^" are not anchored in multiline mode, and a
match for circumflex is possible when the \fIstartoffset\fR argument of
\fBpcre_exec()\fR is non-zero. The PCRE_DOLLAR_ENDONLY option is ignored if
PCRE_MULTILINE is set.
Note that the sequences \\A, \\Z, and \\z can be used to match the start and
end of the subject in both modes, and if all branches of a pattern start with
\\A it is always anchored, whether PCRE_MULTILINE is set or not.
.SH FULL STOP (PERIOD, DOT)
Outside a character class, a dot in the pattern matches any one character in
the subject, including a non-printing character, but not (by default) newline.
If the PCRE_DOTALL option is set, dots match newlines as well. The handling of
dot is entirely independent of the handling of circumflex and dollar, the only
relationship being that they both involve newline characters. Dot has no
special meaning in a character class.
.SH SQUARE BRACKETS
An opening square bracket introduces a character class, terminated by a closing
square bracket. A closing square bracket on its own is not special. If a
closing square bracket is required as a member of the class, it should be the
first data character in the class (after an initial circumflex, if present) or
escaped with a backslash.
A character class matches a single character in the subject; the character must
be in the set of characters defined by the class, unless the first character in
the class is a circumflex, in which case the subject character must not be in
the set defined by the class. If a circumflex is actually required as a member
of the class, ensure it is not the first character, or escape it with a
backslash.
For example, the character class [aeiou] matches any lower case vowel, while
[^aeiou] matches any character that is not a lower case vowel. Note that a
circumflex is just a convenient notation for specifying the characters which
are in the class by enumerating those that are not. It is not an assertion: it
still consumes a character from the subject string, and fails if the current
pointer is at the end of the string.
When caseless matching is set, any letters in a class represent both their
upper case and lower case versions, so for example, a caseless [aeiou] matches
"A" as well as "a", and a caseless [^aeiou] does not match "A", whereas a
caseful version would.
The newline character is never treated in any special way in character classes,
whatever the setting of the PCRE_DOTALL or PCRE_MULTILINE options is. A class
such as [^a] will always match a newline.
The minus (hyphen) character can be used to specify a range of characters in a
character class. For example, [d-m] matches any letter between d and m,
inclusive. If a minus character is required in a class, it must be escaped with
a backslash or appear in a position where it cannot be interpreted as
indicating a range, typically as the first or last character in the class.
It is not possible to have the literal character "]" as the end character of a
range. A pattern such as [W-]46] is interpreted as a class of two characters
("W" and "-") followed by a literal string "46]", so it would match "W46]" or
"-46]". However, if the "]" is escaped with a backslash it is interpreted as
the end of range, so [W-\\]46] is interpreted as a single class containing a
range followed by two separate characters. The octal or hexadecimal
representation of "]" can also be used to end a range.
Ranges operate in ASCII collating sequence. They can also be used for
characters specified numerically, for example [\\000-\\037]. If a range that
includes letters is used when caseless matching is set, it matches the letters
in either case. For example, [W-c] is equivalent to [][\\^_`wxyzabc], matched
caselessly, and if character tables for the "fr" locale are in use,
[\\xc8-\\xcb] matches accented E characters in both cases.
The character types \\d, \\D, \\s, \\S, \\w, and \\W may also appear in a
character class, and add the characters that they match to the class. For
example, [\\dABCDEF] matches any hexadecimal digit. A circumflex can
conveniently be used with the upper case character types to specify a more
restricted set of characters than the matching lower case type. For example,
the class [^\\W_] matches any letter or digit, but not underscore.
All non-alphameric characters other than \\, -, ^ (at the start) and the
terminating ] are non-special in character classes, but it does no harm if they
are escaped.
.SH POSIX CHARACTER CLASSES
Perl 5.6 (not yet released at the time of writing) is going to support the
POSIX notation for character classes, which uses names enclosed by [: and :]
within the enclosing square brackets. PCRE supports this notation. For example,
[01[:alpha:]%]
matches "0", "1", any alphabetic character, or "%". The supported class names
are
alnum letters and digits
alpha letters
ascii character codes 0 - 127
cntrl control characters
digit decimal digits (same as \\d)
graph printing characters, excluding space
lower lower case letters
print printing characters, including space
punct printing characters, excluding letters and digits
space white space (same as \\s)
upper upper case letters
word "word" characters (same as \\w)
xdigit hexadecimal digits
The names "ascii" and "word" are Perl extensions. Another Perl extension is
negation, which is indicated by a ^ character after the colon. For example,
[12[:^digit:]]
matches "1", "2", or any non-digit. PCRE (and Perl) also recognize the POSIX
syntax [.ch.] and [=ch=] where "ch" is a "collating element", but these are not
supported, and an error is given if they are encountered.
.SH VERTICAL BAR
Vertical bar characters are used to separate alternative patterns. For example,
the pattern
gilbert|sullivan
matches either "gilbert" or "sullivan". Any number of alternatives may appear,
and an empty alternative is permitted (matching the empty string).
The matching process tries each alternative in turn, from left to right,
and the first one that succeeds is used. If the alternatives are within a
subpattern (defined below), "succeeds" means matching the rest of the main
pattern as well as the alternative in the subpattern.
.SH INTERNAL OPTION SETTING
The settings of PCRE_CASELESS, PCRE_MULTILINE, PCRE_DOTALL, and PCRE_EXTENDED
can be changed from within the pattern by a sequence of Perl option letters
enclosed between "(?" and ")". The option letters are
i for PCRE_CASELESS
m for PCRE_MULTILINE
s for PCRE_DOTALL
x for PCRE_EXTENDED
For example, (?im) sets caseless, multiline matching. It is also possible to
unset these options by preceding the letter with a hyphen, and a combined
setting and unsetting such as (?im-sx), which sets PCRE_CASELESS and
PCRE_MULTILINE while unsetting PCRE_DOTALL and PCRE_EXTENDED, is also
permitted. If a letter appears both before and after the hyphen, the option is
unset.
The scope of these option changes depends on where in the pattern the setting
occurs. For settings that are outside any subpattern (defined below), the
effect is the same as if the options were set or unset at the start of
matching. The following patterns all behave in exactly the same way:
(?i)abc
a(?i)bc
ab(?i)c
abc(?i)
which in turn is the same as compiling the pattern abc with PCRE_CASELESS set.
In other words, such "top level" settings apply to the whole pattern (unless
there are other changes inside subpatterns). If there is more than one setting
of the same option at top level, the rightmost setting is used.
If an option change occurs inside a subpattern, the effect is different. This
is a change of behaviour in Perl 5.005. An option change inside a subpattern
affects only that part of the subpattern that follows it, so
(a(?i)b)c
matches abc and aBc and no other strings (assuming PCRE_CASELESS is not used).
By this means, options can be made to have different settings in different
parts of the pattern. Any changes made in one alternative do carry on
into subsequent branches within the same subpattern. For example,
(a(?i)b|c)
matches "ab", "aB", "c", and "C", even though when matching "C" the first
branch is abandoned before the option setting. This is because the effects of
option settings happen at compile time. There would be some very weird
behaviour otherwise.
The PCRE-specific options PCRE_UNGREEDY and PCRE_EXTRA can be changed in the
same way as the Perl-compatible options by using the characters U and X
respectively. The (?X) flag setting is special in that it must always occur
earlier in the pattern than any of the additional features it turns on, even
when it is at top level. It is best put at the start.
.SH SUBPATTERNS
Subpatterns are delimited by parentheses (round brackets), which can be nested.
Marking part of a pattern as a subpattern does two things:
1. It localizes a set of alternatives. For example, the pattern
cat(aract|erpillar|)
matches one of the words "cat", "cataract", or "caterpillar". Without the
parentheses, it would match "cataract", "erpillar" or the empty string.
2. It sets up the subpattern as a capturing subpattern (as defined above).
When the whole pattern matches, that portion of the subject string that matched
the subpattern is passed back to the caller via the \fIovector\fR argument of
\fBpcre_exec()\fR. Opening parentheses are counted from left to right (starting
from 1) to obtain the numbers of the capturing subpatterns.
For example, if the string "the red king" is matched against the pattern
the ((red|white) (king|queen))
the captured substrings are "red king", "red", and "king", and are numbered 1,
2, and 3.
The fact that plain parentheses fulfil two functions is not always helpful.
There are often times when a grouping subpattern is required without a
capturing requirement. If an opening parenthesis is followed by "?:", the
subpattern does not do any capturing, and is not counted when computing the
number of any subsequent capturing subpatterns. For example, if the string "the
white queen" is matched against the pattern
the ((?:red|white) (king|queen))
the captured substrings are "white queen" and "queen", and are numbered 1 and
2. The maximum number of captured substrings is 99, and the maximum number of
all subpatterns, both capturing and non-capturing, is 200.
As a convenient shorthand, if any option settings are required at the start of
a non-capturing subpattern, the option letters may appear between the "?" and
the ":". Thus the two patterns
(?i:saturday|sunday)
(?:(?i)saturday|sunday)
match exactly the same set of strings. Because alternative branches are tried
from left to right, and options are not reset until the end of the subpattern
is reached, an option setting in one branch does affect subsequent branches, so
the above patterns match "SUNDAY" as well as "Saturday".
.SH REPETITION
Repetition is specified by quantifiers, which can follow any of the following
items:
a single character, possibly escaped
the . metacharacter
a character class
a back reference (see next section)
a parenthesized subpattern (unless it is an assertion - see below)
The general repetition quantifier specifies a minimum and maximum number of
permitted matches, by giving the two numbers in curly brackets (braces),
separated by a comma. The numbers must be less than 65536, and the first must
be less than or equal to the second. For example:
z{2,4}
matches "zz", "zzz", or "zzzz". A closing brace on its own is not a special
character. If the second number is omitted, but the comma is present, there is
no upper limit; if the second number and the comma are both omitted, the
quantifier specifies an exact number of required matches. Thus
[aeiou]{3,}
matches at least 3 successive vowels, but may match many more, while
\\d{8}
matches exactly 8 digits. An opening curly bracket that appears in a position
where a quantifier is not allowed, or one that does not match the syntax of a
quantifier, is taken as a literal character. For example, {,6} is not a
quantifier, but a literal string of four characters.
The quantifier {0} is permitted, causing the expression to behave as if the
previous item and the quantifier were not present.
For convenience (and historical compatibility) the three most common
quantifiers have single-character abbreviations:
* is equivalent to {0,}
+ is equivalent to {1,}
? is equivalent to {0,1}
It is possible to construct infinite loops by following a subpattern that can
match no characters with a quantifier that has no upper limit, for example:
(a?)*
Earlier versions of Perl and PCRE used to give an error at compile time for
such patterns. However, because there are cases where this can be useful, such
patterns are now accepted, but if any repetition of the subpattern does in fact
match no characters, the loop is forcibly broken.
By default, the quantifiers are "greedy", that is, they match as much as
possible (up to the maximum number of permitted times), without causing the
rest of the pattern to fail. The classic example of where this gives problems
is in trying to match comments in C programs. These appear between the
sequences /* and */ and within the sequence, individual * and / characters may
appear. An attempt to match C comments by applying the pattern
/\\*.*\\*/
to the string
/* first comment */ not comment /* second comment */
fails, because it matches the entire string owing to the greediness of the .*
item.
However, if a quantifier is followed by a question mark, it ceases to be
greedy, and instead matches the minimum number of times possible, so the
pattern
/\\*.*?\\*/
does the right thing with the C comments. The meaning of the various
quantifiers is not otherwise changed, just the preferred number of matches.
Do not confuse this use of question mark with its use as a quantifier in its
own right. Because it has two uses, it can sometimes appear doubled, as in
\\d??\\d
which matches one digit by preference, but can match two if that is the only
way the rest of the pattern matches.
If the PCRE_UNGREEDY option is set (an option which is not available in Perl),
the quantifiers are not greedy by default, but individual ones can be made
greedy by following them with a question mark. In other words, it inverts the
default behaviour.
When a parenthesized subpattern is quantified with a minimum repeat count that
is greater than 1 or with a limited maximum, more store is required for the
compiled pattern, in proportion to the size of the minimum or maximum.
If a pattern starts with .* or .{0,} and the PCRE_DOTALL option (equivalent
to Perl's /s) is set, thus allowing the . to match newlines, the pattern is
implicitly anchored, because whatever follows will be tried against every
character position in the subject string, so there is no point in retrying the
overall match at any position after the first. PCRE treats such a pattern as
though it were preceded by \\A. In cases where it is known that the subject
string contains no newlines, it is worth setting PCRE_DOTALL when the pattern
begins with .* in order to obtain this optimization, or alternatively using ^
to indicate anchoring explicitly.
When a capturing subpattern is repeated, the value captured is the substring
that matched the final iteration. For example, after
(tweedle[dume]{3}\\s*)+
has matched "tweedledum tweedledee" the value of the captured substring is
"tweedledee". However, if there are nested capturing subpatterns, the
corresponding captured values may have been set in previous iterations. For
example, after
/(a|(b))+/
matches "aba" the value of the second captured substring is "b".
.SH BACK REFERENCES
Outside a character class, a backslash followed by a digit greater than 0 (and
possibly further digits) is a back reference to a capturing subpattern earlier
(i.e. to its left) in the pattern, provided there have been that many previous
capturing left parentheses.
However, if the decimal number following the backslash is less than 10, it is
always taken as a back reference, and causes an error only if there are not
that many capturing left parentheses in the entire pattern. In other words, the
parentheses that are referenced need not be to the left of the reference for
numbers less than 10. See the section entitled "Backslash" above for further
details of the handling of digits following a backslash.
A back reference matches whatever actually matched the capturing subpattern in
the current subject string, rather than anything matching the subpattern
itself. So the pattern
(sens|respons)e and \\1ibility
matches "sense and sensibility" and "response and responsibility", but not
"sense and responsibility". If caseful matching is in force at the time of the
back reference, the case of letters is relevant. For example,
((?i)rah)\\s+\\1
matches "rah rah" and "RAH RAH", but not "RAH rah", even though the original
capturing subpattern is matched caselessly.
There may be more than one back reference to the same subpattern. If a
subpattern has not actually been used in a particular match, any back
references to it always fail. For example, the pattern
(a|(bc))\\2
always fails if it starts to match "a" rather than "bc". Because there may be
up to 99 back references, all digits following the backslash are taken
as part of a potential back reference number. If the pattern continues with a
digit character, some delimiter must be used to terminate the back reference.
If the PCRE_EXTENDED option is set, this can be whitespace. Otherwise an empty
comment can be used.
A back reference that occurs inside the parentheses to which it refers fails
when the subpattern is first used, so, for example, (a\\1) never matches.
However, such references can be useful inside repeated subpatterns. For
example, the pattern
(a|b\\1)+
matches any number of "a"s and also "aba", "ababbaa" etc. At each iteration of
the subpattern, the back reference matches the character string corresponding
to the previous iteration. In order for this to work, the pattern must be such
that the first iteration does not need to match the back reference. This can be
done using alternation, as in the example above, or by a quantifier with a
minimum of zero.
.SH ASSERTIONS
An assertion is a test on the characters following or preceding the current
matching point that does not actually consume any characters. The simple
assertions coded as \\b, \\B, \\A, \\Z, \\z, ^ and $ are described above. More
complicated assertions are coded as subpatterns. There are two kinds: those
that look ahead of the current position in the subject string, and those that
look behind it.
An assertion subpattern is matched in the normal way, except that it does not
cause the current matching position to be changed. Lookahead assertions start
with (?= for positive assertions and (?! for negative assertions. For example,
\\w+(?=;)
matches a word followed by a semicolon, but does not include the semicolon in
the match, and
foo(?!bar)
matches any occurrence of "foo" that is not followed by "bar". Note that the
apparently similar pattern
(?!foo)bar
does not find an occurrence of "bar" that is preceded by something other than
"foo"; it finds any occurrence of "bar" whatsoever, because the assertion
(?!foo) is always true when the next three characters are "bar". A
lookbehind assertion is needed to achieve this effect.
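Lookbehind assertions start with (?<= for positive assertions and (?<! for
negative assertions. For example,
(?<!foo)bar
does find an occurrence of "bar" that is not preceded by "foo". The contents of
a lookbehind assertion are restricted such that all the strings it matches must
have a fixed length.
.SH ONCE-ONLY SUBPATTERNS
With both maximizing and minimizing repetition, failure of what follows
normally causes the repeated item to be re-evaluated to see if a different
number of repeats allows the rest of the pattern to match. Sometimes it is
useful to prevent this, either to change the nature of the match, or to cause
it to fail earlier than it otherwise might, when the author of the pattern
knows there is no point in carrying on. Once-only subpatterns provide the means
for specifying that once a portion of the pattern has matched, it is not to be
re-evaluated in this way. The notation is another kind of special parenthesis,
starting with (?> as in this example: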
(?>\\d+)bar
This kind of parenthesis "locks up" the part of the pattern it contains once
it has matched, and a failure further into the pattern is prevented from
backtracking into it. Backtracking past it to previous items, however, works as
normal.
An alternative description is that a subpattern of this type matches the string
of characters that an identical standalone pattern would match, if anchored at
the current point in the subject string.
Once-only subpatterns are not capturing subpatterns. Simple cases such as the
above example can be thought of as a maximizing repeat that must swallow
everything it can. So, while both \\d+ and \\d+? are prepared to adjust the
number of digits they match in order to make the rest of the pattern match,
(?>\\d+) can only match an entire sequence of digits.
This construction can of course contain arbitrarily complicated subpatterns,
and it can be nested.
Once-only subpatterns can be used in conjunction with lookbehind assertions to
specify efficient matching at the end of the subject string. Consider a simple
pattern such as
abcd$
when applied to a long string which does not match. Because matching proceeds
from left to right, PCRE will look for each "a" in the subject and then see if
what follows matches the rest of the pattern. If the pattern is specified as
^.*abcd$
the initial .* matches the entire string at first, but when this fails (because
there is no following "a"), it backtracks to match all but the last character,
then all but the last two characters, and so on. Once again the search for "a"
covers the entire string, from right to left, so we are no better off. However,
if the pattern is written as
^(?>.*)(?<=abcd)
there can be no backtracking for the .* item; it can match only the entire
string. The subsequent lookbehind assertion does a single test on the last four
characters. If it fails, the match fails immediately. For long strings, this
approach makes a significant difference to the processing time.
When a pattern contains an unlimited repeat inside a subpattern that can itself
be repeated an unlimited number of times, the use of a once-only subpattern is
the only way to avoid some failing matches taking a very long time indeed.
The pattern
(\\D+|<\\d+>)*[!?]
matches an unlimited number of substrings that either consist of non-digits, or
digits enclosed in <>, followed by either ! or ?. When it matches, it runs
quickly. However, if it is applied to
aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
it takes a long time before reporting failure. This is because the string can
be divided between the two repeats in a large number of ways, and all have to
be tried. (The example used [!?] rather than a single character at the end,
because both PCRE and Perl have an optimization that allows for fast failure
when a single character is used. They remember the last single character that
is required for a match, and fail early if it is not present in the string.)
If the pattern is changed to
((?>\\D+)|<\\d+>)*[!?]
sequences of non-digits cannot be broken, and failure happens quickly.
.SH CONDITIONAL SUBPATTERNS
It is possible to cause the matching process to obey a subpattern
conditionally or to choose between two alternative subpatterns, depending on
the result of an assertion, or whether a previous capturing subpattern matched
or not. The two possible forms of conditional subpattern are
(?(condition)yes-pattern)
(?(condition)yes-pattern|no-pattern)
If the condition is satisfied, the yes-pattern is used; otherwise the
no-pattern (if present) is used. If there are more than two alternatives in the
subpattern, a compile-time error occurs.
There are two kinds of condition. If the text between the parentheses consists
of a sequence of digits, the condition is satisfied if the capturing subpattern
of that number has previously matched. The number must be greater than zero.
Consider the following pattern, which contains non-significant white space to
make it more readable (assume the PCRE_EXTENDED option) and to divide it into
three parts for ease of discussion:
( \\( )? [^()]+ (?(1) \\) )
The first part matches an optional opening parenthesis, and if that
character is present, sets it as the first captured substring. The second part
matches one or more characters that are not parentheses. The third part is a
conditional subpattern that tests whether the first set of parentheses matched
or not. If they did, that is, if the subject started with an opening parenthesis,
the condition is true, and so the yes-pattern is executed and a closing
parenthesis is required. Otherwise, since no-pattern is not present, the
subpattern matches nothing. In other words, this pattern matches a sequence of
non-parentheses, optionally enclosed in parentheses.
If the condition is not a sequence of digits, it must be an assertion. This may
be a positive or negative lookahead or lookbehind assertion. Consider this
pattern, again containing non-significant white space, and with the two
alternatives on the second line:
(?(?=[^a-z]*[a-z])
\\d{2}-[a-z]{3}-\\d{2} | \\d{2}-\\d{2}-\\d{2} )
The condition is a positive lookahead assertion that matches an optional
sequence of non-letters followed by a letter. In other words, it tests for the
presence of at least one letter in the subject. If a letter is found, the
subject is matched against the first alternative; otherwise it is matched
against the second. This pattern matches strings in one of the two forms
dd-aaa-dd or dd-dd-dd, where aaa are letters and dd are digits.
.SH COMMENTS
The sequence (?# marks the start of a comment which continues up to the next
closing parenthesis. Nested parentheses are not permitted. The characters
that make up a comment play no part in the pattern matching at all.
If the PCRE_EXTENDED option is set, an unescaped # character outside a
character class introduces a comment that continues up to the next newline
character in the pattern.
.SH RECURSIVE PATTERNS
Consider the problem of matching a string in parentheses, allowing for
unlimited nested parentheses. Without the use of recursion, the best that can
be done is to use a pattern that matches up to some fixed depth of nesting. It
is not possible to handle an arbitrary nesting depth. Perl 5.6 has provided an
experimental facility that allows regular expressions to recurse (amongst other
things). It does this by interpolating Perl code in the expression at run time,
and the code can refer to the expression itself. A Perl pattern to solve the
parentheses problem can be created like this:
$re = qr{\\( (?: (?>[^()]+) | (?p{$re}) )* \\)}x;
The (?p{...}) item interpolates Perl code at run time, and in this case refers
recursively to the pattern in which it appears. Obviously, PCRE cannot support
the interpolation of Perl code. Instead, the special item (?R) is provided for
the specific case of recursion. This PCRE pattern solves the parentheses
problem (assume the PCRE_EXTENDED option is set so that white space is
ignored):
\\( ( (?>[^()]+) | (?R) )* \\)
First it matches an opening parenthesis. Then it matches any number of
substrings which can either be a sequence of non-parentheses, or a recursive
match of the pattern itself (i.e. a correctly parenthesized substring). Finally
there is a closing parenthesis.
This particular example pattern contains nested unlimited repeats, and so the
use of a once-only subpattern for matching strings of non-parentheses is
important when applying the pattern to strings that do not match. For example,
when it is applied to
(aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa()
it yields "no match" quickly. However, if a once-only subpattern is not used,
the match runs for a very long time indeed because there are so many different
ways the + and * repeats can carve up the subject, and all have to be tested
before failure can be reported.
The values set for any capturing subpatterns are those from the outermost level
of the recursion at which the subpattern value is set. If the pattern above is
matched against
(ab(cd)ef)
the value for the capturing parentheses is "ef", which is the last value taken
on at the top level. If additional parentheses are added, giving
\\( ( ( (?>[^()]+) | (?R) )* ) \\)
^ ^
^ ^
the string they capture is "ab(cd)ef", the contents of the top level
parentheses. If there are more than 15 capturing parentheses in a pattern, PCRE
has to obtain extra memory to store data during a recursion, which it does by
using \fBpcre_malloc\fR, freeing it via \fBpcre_free\fR afterwards. If no
memory can be obtained, it saves data for the first 15 capturing parentheses
only, as there is no way to give an out-of-memory error from within a
recursion.
.SH PERFORMANCE
Certain items that may appear in patterns are more efficient than others. It is
more efficient to use a character class like [aeiou] than a set of alternatives
such as (a|e|i|o|u). In general, the simplest construction that provides the
required behaviour is usually the most efficient. Jeffrey Friedl's book
contains a lot of discussion about optimizing regular expressions for efficient
performance.
When a pattern begins with .* and the PCRE_DOTALL option is set, the pattern is
implicitly anchored by PCRE, since it can match only at the start of a subject
string. However, if PCRE_DOTALL is not set, PCRE cannot make this optimization,
because the . metacharacter does not then match a newline, and if the subject
string contains newlines, the pattern may match from the character immediately
following one of them instead of from the very start. For example, the pattern
(.*) second
matches the subject "first\\nand second" (where \\n stands for a newline
character) with the first captured substring being "and". In order to do this,
PCRE has to retry the match starting after every newline in the subject.
If you are using such a pattern with subject strings that do not contain
newlines, the best performance is obtained by setting PCRE_DOTALL, or starting
the pattern with ^.* to indicate explicit anchoring. That saves PCRE from
having to scan along the subject looking for a newline to restart at.
Beware of patterns that contain nested indefinite repeats. These can take a
long time to run when applied to a string that does not match. Consider the
pattern fragment
(a+)*
This can match "aaaa" in 33 different ways, and this number increases very
rapidly as the string gets longer. (The * repeat can match 0, 1, 2, 3, or 4
times, and for each of those cases other than 0, the + repeats can match
different numbers of times.) When the remainder of the pattern is such that the
entire match is going to fail, PCRE has in principle to try every possible
variation, and this can take an extremely long time.
An optimization catches some of the more simple cases such as
(a+)*b
where a literal character follows. Before embarking on the standard matching
procedure, PCRE checks that there is a "b" later in the subject string, and if
there is not, it fails the match immediately. However, when there is no
following literal this optimization cannot be used. You can see the difference
by comparing the behaviour of
(a+)*\\d
with the pattern above. The earlier pattern, (a+)*b, gives a failure almost
instantly when applied to a whole line of "a" characters, whereas (a+)*\\d
takes an appreciable time with strings longer than about 20 characters.
.SH UTF-8 SUPPORT
Starting at release 3.3, PCRE has some support for character strings encoded
in the UTF-8 format. This is incomplete, and is regarded as experimental. In
order to use it, you must configure PCRE to include UTF-8 support in the code,
and, in addition, you must call \fBpcre_compile()\fR with the PCRE_UTF8 option
flag. When you do this, both the pattern and any subject strings that are
matched against it are treated as UTF-8 strings instead of just strings of
bytes, but only in the cases that are mentioned below.
If you compile PCRE with UTF-8 support, but do not use it at run time, the
library will be a bit bigger, but the additional run time overhead is limited
to testing the PCRE_UTF8 flag in several places, so should not be very large.
PCRE assumes that the strings it is given contain valid UTF-8 codes. It does
not diagnose invalid UTF-8 strings. If you pass invalid UTF-8 strings to PCRE,
the results are undefined.
Running with PCRE_UTF8 set causes these changes in the way PCRE works:
1. In a pattern, the escape sequence \\x{...}, where the contents of the braces
is a string of hexadecimal digits, is interpreted as a UTF-8 character whose
code number is the given hexadecimal number, for example: \\x{1234}. This
inserts from one to six literal bytes into the pattern, using the UTF-8
encoding. If a non-hexadecimal digit appears between the braces, the item is
not recognized.
2. The original hexadecimal escape sequence, \\xhh, generates a two-byte UTF-8
character if its value is greater than 127.
3. Repeat quantifiers are NOT correctly handled if they follow a multibyte
character. For example, \\x{100}* and \\xc3+ do not work. If you want to
repeat such characters, you must enclose them in non-capturing parentheses,
for example (?:\\x{100}), at present.
4. The dot metacharacter matches one UTF-8 character instead of a single byte.
5. Unlike literal UTF-8 characters, the dot metacharacter followed by a
repeat quantifier does operate correctly on UTF-8 characters instead of
single bytes.
6. Although the \\x{...} escape is permitted in a character class, characters
whose values are greater than 255 cannot be included in a class.
7. A class is matched against a UTF-8 character instead of just a single byte,
but it can match only characters whose values are less than 256. Characters
with greater values always fail to match a class.
8. Repeated classes work correctly on multiple characters.
9. Classes containing just a single character whose value is greater than 127
(but less than 256), for example, [\\x80] or [^\\x{93}], do not work because
these are optimized into single byte matches. In the first case, of course,
the class brackets are just redundant.
10. Lookbehind assertions move backwards in the subject by a fixed number of
characters instead of a fixed number of bytes. Simple cases have been tested
to work correctly, but there may be hidden gotchas herein.
11. The character types such as \\d and \\w do not work correctly with UTF-8
characters. They continue to test a single byte.
12. Anything not explicitly mentioned here continues to work in bytes rather
than in characters.
The following UTF-8 features of Perl 5.6 are not implemented:
1. The escape sequence \\C to match a single byte.
2. The use of Unicode tables and properties and escapes \\p, \\P, and \\X.
.SH AUTHOR
Philip Hazel
.br
University Computing Service,
.br
New Museums Site,
.br
Cambridge CB2 3QG, England.
.br
Phone: +44 1223 334714
Last updated: 28 August 2000,
.br
the 250th anniversary of the death of J.S. Bach.
.br
Copyright (c) 1997-2000 University of Cambridge.
privoxy-3.0.21-stable/./pcre/doc/pcreposix.html 000640 001751 001751 00000016271 10546014100 020360 0 ustar 00fk fk 000000 000000
pcreposix specification
pcreposix specification
This HTML document has been generated automatically from the original man page.
If there is any nonsense in it, please consult the man page in case the
conversion went wrong.
This set of functions provides a POSIX-style API to the PCRE regular expression
package. See the pcre documentation for a description of the native API,
which contains additional functionality.
The functions described here are just wrapper functions that ultimately call
the native API. Their prototypes are defined in the pcreposix.h header
file, and on Unix systems the library itself is called pcreposix.a, so
can be accessed by adding -lpcreposix to the command for linking an
application which uses them. Because the POSIX functions call the native ones,
it is also necessary to add -lpcre.
I have implemented only those option bits that can be reasonably mapped to PCRE
native options. In addition, the options REG_EXTENDED and REG_NOSUB are defined
with the value zero. They have no effect, but since programs that are written
to the POSIX interface often use them, this makes it easier to slot in PCRE as
a replacement library. Other POSIX options are not even defined.
When PCRE is called via these functions, it is only the API that is POSIX-like
in style. The syntax and semantics of the regular expressions themselves are
still those of Perl, subject to the setting of various PCRE options, as
described below.
The header for these functions is supplied as pcreposix.h to avoid any
potential clash with other POSIX libraries. It can, of course, be renamed or
aliased as regex.h, which is the "correct" name. It provides two
structure types, regex_t for compiled internal forms, and
regmatch_t for returning captured substrings. It also defines some
constants whose names start with "REG_"; these are used for setting options and
identifying error codes.
The function regcomp() is called to compile a pattern into an
internal form. The pattern is a C string terminated by a binary zero, and
is passed in the argument pattern. The preg argument is a pointer
to a regex_t structure which is used as a base for storing information about
the compiled expression.
The argument cflags is either zero, or contains one or more of the bits
defined by the following macros:
REG_ICASE
The PCRE_CASELESS option is set when the expression is passed for compilation
to the native function.
REG_NEWLINE
The PCRE_MULTILINE option is set when the expression is passed for compilation
to the native function.
In the absence of these flags, no options are passed to the native function.
This means that the regex is compiled with PCRE default semantics. In
particular, the way it handles newline characters in the subject string is the
Perl way, not the POSIX way. Note that setting PCRE_MULTILINE has only
some of the effects specified for REG_NEWLINE. It does not affect the way
newlines are matched by . (they aren't) or a negative class such as [^a] (they
are).
The yield of regcomp() is zero on success, and non-zero otherwise. The
preg structure is filled in on success, and one member of the structure
is publicized: re_nsub contains the number of capturing subpatterns in
the regular expression. Various error codes are defined in the header file.
The function regexec() is called to match a pre-compiled pattern
preg against a given string, which is terminated by a zero byte,
subject to the options in eflags. These can be:
REG_NOTBOL
The PCRE_NOTBOL option is set when calling the underlying PCRE matching
function.
REG_NOTEOL
The PCRE_NOTEOL option is set when calling the underlying PCRE matching
function.
The portion of the string that was matched, and also any captured substrings,
are returned via the pmatch argument, which points to an array of
nmatch structures of type regmatch_t, containing the members
rm_so and rm_eo. These contain the offset to the first character of
each substring and the offset to the first character after the end of each
substring, respectively. The 0th element of the vector relates to the entire
portion of string that was matched; subsequent elements relate to the
capturing subpatterns of the regular expression. Unused entries in the array
have both structure members set to -1.
A successful match yields a zero return; various error codes are defined in the
header file, of which REG_NOMATCH is the "expected" failure code.
The regerror() function maps a non-zero errorcode from either
regcomp or regexec to a printable message. If preg is not
NULL, the error should have arisen from the use of that structure. A message
terminated by a binary zero is placed in errbuf. The length of the
message, including the zero, is limited to errbuf_size. The yield of the
function is the size of buffer needed to hold the whole message.
Compiling a regular expression causes memory to be allocated and associated
with the preg structure. The function regfree() frees all such
memory, after which preg may no longer be used as a compiled expression.
Philip Hazel <ph10@cam.ac.uk>
University Computing Service,
New Museums Site,
Cambridge CB2 3QG, England.
Phone: +44 1223 334714
Copyright (c) 1997-2000 University of Cambridge.
privoxy-3.0.21-stable/./pcre/doc/pcre.html 000640 001751 001751 00000267174 10546014100 017307 0 ustar 00fk fk 000000 000000
pcre specification
pcre specification
This HTML document has been generated automatically from the original man page.
If there is any nonsense in it, please consult the man page in case the
conversion went wrong.
The PCRE library is a set of functions that implement regular expression
pattern matching using the same syntax and semantics as Perl 5, with just a few
differences (see below). The current implementation corresponds to Perl 5.005,
with some additional features from later versions. This includes some
experimental, incomplete support for UTF-8 encoded strings. Details of exactly
what is and what is not supported are given below.
PCRE has its own native API, which is described in this document. There is also
a set of wrapper functions that correspond to the POSIX regular expression API.
These are described in the pcreposix documentation.
The native API function prototypes are defined in the header file pcre.h,
and on Unix systems the library itself is called libpcre.a, so can be
accessed by adding -lpcre to the command for linking an application which
calls it. The header file defines the macros PCRE_MAJOR and PCRE_MINOR to
contain the major and minor release numbers for the library. Applications can
use these to include support for different releases.
The functions pcre_compile(), pcre_study(), and pcre_exec()
are used for compiling and matching regular expressions.
The functions pcre_copy_substring(), pcre_get_substring(), and
pcre_get_substring_list() are convenience functions for extracting
captured substrings from a matched subject string; pcre_free_substring()
and pcre_free_substring_list() are also provided, to free the memory used
for extracted strings.
The function pcre_maketables() is used (optionally) to build a set of
character tables in the current locale for passing to pcre_compile().
The function pcre_fullinfo() is used to find out information about a
compiled pattern; pcre_info() is an obsolete version which returns only
some of the available information, but is retained for backwards compatibility.
The function pcre_version() returns a pointer to a string containing the
version of PCRE and its date of release.
The global variables pcre_malloc and pcre_free initially contain
the entry points of the standard malloc() and free() functions
respectively. PCRE calls the memory management functions via these variables,
so a calling program can replace them if it wishes to intercept the calls. This
should be done before calling any PCRE functions.
The PCRE functions can be used in multi-threading applications, with the
proviso that the memory management functions pointed to by pcre_malloc
and pcre_free are shared by all threads.
The compiled form of a regular expression is not altered during matching, so
the same compiled pattern can safely be used by several threads at once.
The function pcre_compile() is called to compile a pattern into an
internal form. The pattern is a C string terminated by a binary zero, and
is passed in the argument pattern. A pointer to a single block of memory
that is obtained via pcre_malloc is returned. This contains the
compiled code and related data. The pcre type is defined for this for
convenience, but in fact pcre is just a typedef for void, since the
contents of the block are not externally defined. It is up to the caller to
free the memory when it is no longer required.
The size of a compiled pattern is roughly proportional to the length of the
pattern string, except that each character class (other than those containing
just a single character, negated or not) requires 33 bytes, and repeat
quantifiers with a minimum greater than one or a bounded maximum cause the
relevant portions of the compiled pattern to be replicated.
The options argument contains independent bits that affect the
compilation. It should be zero if no options are required. Some of the options,
in particular, those that are compatible with Perl, can also be set and unset
from within the pattern (see the detailed description of regular expressions
below). For these options, the contents of the options argument specifies
their initial settings at the start of compilation and execution. The
PCRE_ANCHORED option can be set at the time of matching as well as at compile
time.
If errptr is NULL, pcre_compile() returns NULL immediately.
Otherwise, if compilation of a pattern fails, pcre_compile() returns
NULL, and sets the variable pointed to by errptr to point to a textual
error message. The offset from the start of the pattern to the character where
the error was discovered is placed in the variable pointed to by
erroffset, which must not be NULL. If it is, an immediate error is given.
If the final argument, tableptr, is NULL, PCRE uses a default set of
character tables which are built when it is compiled, using the default C
locale. Otherwise, tableptr must be the result of a call to
pcre_maketables(). See the section on locale support below.
The following option bits are defined in the header file:
PCRE_ANCHORED
If this bit is set, the pattern is forced to be "anchored", that is, it is
constrained to match only at the start of the string which is being searched
(the "subject string"). This effect can also be achieved by appropriate
constructs in the pattern itself, which is the only way to do it in Perl.
PCRE_CASELESS
If this bit is set, letters in the pattern match both upper and lower case
letters. It is equivalent to Perl's /i option.
PCRE_DOLLAR_ENDONLY
If this bit is set, a dollar metacharacter in the pattern matches only at the
end of the subject string. Without this option, a dollar also matches
immediately before the final character if it is a newline (but not before any
other newlines). The PCRE_DOLLAR_ENDONLY option is ignored if PCRE_MULTILINE is
set. There is no equivalent to this option in Perl.
PCRE_DOTALL
If this bit is set, a dot metacharacter in the pattern matches all characters,
including newlines. Without it, newlines are excluded. This option is
equivalent to Perl's /s option. A negative class such as [^a] always matches a
newline character, independent of the setting of this option.
PCRE_EXTENDED
If this bit is set, whitespace data characters in the pattern are totally
ignored except when escaped or inside a character class, and characters between
an unescaped # outside a character class and the next newline character,
inclusive, are also ignored. This is equivalent to Perl's /x option, and makes
it possible to include comments inside complicated patterns. Note, however,
that this applies only to data characters. Whitespace characters may never
appear within special character sequences in a pattern, for example within the
sequence (?( which introduces a conditional subpattern.
PCRE_EXTRA
This option was invented in order to turn on additional functionality of PCRE
that is incompatible with Perl, but it is currently of very little use. When
set, any backslash in a pattern that is followed by a letter that has no
special meaning causes an error, thus reserving these combinations for future
expansion. By default, as in Perl, a backslash followed by a letter with no
special meaning is treated as a literal. There are at present no other features
controlled by this option. It can also be set by a (?X) option setting within a
pattern.
PCRE_MULTILINE
By default, PCRE treats the subject string as consisting of a single "line" of
characters (even if it actually contains several newlines). The "start of line"
metacharacter (^) matches only at the start of the string, while the "end of
line" metacharacter ($) matches only at the end of the string, or before a
terminating newline (unless PCRE_DOLLAR_ENDONLY is set). This is the same as
Perl.
When PCRE_MULTILINE is set, the "start of line" and "end of line" constructs
match immediately following or immediately before any newline in the subject
string, respectively, as well as at the very start and end. This is equivalent
to Perl's /m option. If there are no "\n" characters in a subject string, or
no occurrences of ^ or $ in a pattern, setting PCRE_MULTILINE has no
effect.
PCRE_UNGREEDY
This option inverts the "greediness" of the quantifiers so that they are not
greedy by default, but become greedy if followed by "?". It is not compatible
with Perl. It can also be set by a (?U) option setting within the pattern.
PCRE_UTF8
This option causes PCRE to regard both the pattern and the subject as strings
of UTF-8 characters instead of just byte strings. However, it is available only
if PCRE has been built to include UTF-8 support. If not, the use of this option
provokes an error. Support for UTF-8 is new, experimental, and incomplete.
Details of exactly what it entails are given below.
When a pattern is going to be used several times, it is worth spending more
time analyzing it in order to speed up the time taken for matching. The
function pcre_study() takes a pointer to a compiled pattern as its first
argument, and returns a pointer to a pcre_extra block (another void
typedef) containing additional information about the pattern; this can be
passed to pcre_exec(). If no additional information is available, NULL
is returned.
The second argument contains option bits. At present, no options are defined
for pcre_study(), and this argument should always be zero.
The third argument for pcre_study() is a pointer to an error message. If
studying succeeds (even if no data is returned), the variable it points to is
set to NULL. Otherwise it points to a textual error message.
At present, studying a pattern is useful only for non-anchored patterns that do
not have a single fixed starting character. A bitmap of possible starting
characters is created.
PCRE handles caseless matching, and determines whether characters are letters,
digits, or whatever, by reference to a set of tables. The library contains a
default set of tables which is created in the default C locale when PCRE is
compiled. This is used when the final argument of pcre_compile() is NULL,
and is sufficient for many applications.
An alternative set of tables can, however, be supplied. Such tables are built
by calling the pcre_maketables() function, which has no arguments, in the
relevant locale. The result can then be passed to pcre_compile() as often
as necessary. For example, to build and use tables that are appropriate for the
French locale (where accented characters with codes greater than 128 are
treated as letters), the following code could be used:
setlocale(LC_CTYPE, "fr");
tables = pcre_maketables();
re = pcre_compile(..., tables);
The tables are built in memory that is obtained via pcre_malloc. The
pointer that is passed to pcre_compile is saved with the compiled
pattern, and the same tables are used via this pointer by pcre_study()
and pcre_exec(). Thus for any single pattern, compilation, studying and
matching all happen in the same locale, but different patterns can be compiled
in different locales. It is the caller's responsibility to ensure that the
memory containing the tables remains available for as long as it is needed.
The pcre_fullinfo() function returns information about a compiled
pattern. It replaces the obsolete pcre_info() function, which is
nevertheless retained for backwards compatibility (and is documented below).
The first argument for pcre_fullinfo() is a pointer to the compiled
pattern. The second argument is the result of pcre_study(), or NULL if
the pattern was not studied. The third argument specifies which piece of
information is required, while the fourth argument is a pointer to a variable
to receive the data. The yield of the function is zero for success, or one of
the following negative numbers:
PCRE_ERROR_NULL the argument code was NULL
the argument where was NULL
PCRE_ERROR_BADMAGIC the "magic number" was not found
PCRE_ERROR_BADOPTION the value of what was invalid
The possible values for the third argument are defined in pcre.h, and are
as follows:
PCRE_INFO_OPTIONS
Return a copy of the options with which the pattern was compiled. The fourth
argument should point to an unsigned long int variable. These option bits
are those specified in the call to pcre_compile(), modified by any
top-level option settings within the pattern itself, and with the PCRE_ANCHORED
bit forcibly set if the form of the pattern implies that it can match only at
the start of a subject string.
PCRE_INFO_SIZE
Return the size of the compiled pattern, that is, the value that was passed as
the argument to pcre_malloc() when PCRE was getting memory in which to
place the compiled data. The fourth argument should point to a size_t
variable.
PCRE_INFO_CAPTURECOUNT
Return the number of capturing subpatterns in the pattern. The fourth argument
should point to an int variable.
PCRE_INFO_BACKREFMAX
Return the number of the highest back reference in the pattern. The fourth
argument should point to an int variable. Zero is returned if there are
no back references.
PCRE_INFO_FIRSTCHAR
Return information about the first character of any matched string, for a
non-anchored pattern. If there is a fixed first character, e.g. from a pattern
such as (cat|cow|coyote), it is returned in the integer pointed to by
where. Otherwise, if either
(a) the pattern was compiled with the PCRE_MULTILINE option, and every branch
starts with "^", or
(b) every branch of the pattern starts with ".*" and PCRE_DOTALL is not set
(if it were set, the pattern would be anchored),
-1 is returned, indicating that the pattern matches only at the start of a
subject string or after any "\n" within the string. Otherwise -2 is returned.
For anchored patterns, -2 is returned.
PCRE_INFO_FIRSTTABLE
If the pattern was studied, and this resulted in the construction of a 256-bit
table indicating a fixed set of characters for the first character in any
matching string, a pointer to the table is returned. Otherwise NULL is
returned. The fourth argument should point to an unsigned char *
variable.
PCRE_INFO_LASTLITERAL
For a non-anchored pattern, return the value of the rightmost literal character
which must exist in any matched string, other than at its start. The fourth
argument should point to an int variable. If there is no such character,
or if the pattern is anchored, -1 is returned. For example, for the pattern
/a\d+z\d+/ the returned value is 'z'.
The pcre_info() function is now obsolete because its interface is too
restrictive to return all the available data about a compiled pattern. New
programs should use pcre_fullinfo() instead. The yield of
pcre_info() is the number of capturing subpatterns, or one of the
following negative numbers:
PCRE_ERROR_NULL the argument code was NULL
PCRE_ERROR_BADMAGIC the "magic number" was not found
If the optptr argument is not NULL, a copy of the options with which the
pattern was compiled is placed in the integer it points to (see
PCRE_INFO_OPTIONS above).
If the pattern is not anchored and the firstcharptr argument is not NULL,
it is used to pass back information about the first character of any matched
string (see PCRE_INFO_FIRSTCHAR above).
The function pcre_exec() is called to match a subject string against a
pre-compiled pattern, which is passed in the code argument. If the
pattern has been studied, the result of the study should be passed in the
extra argument. Otherwise this must be NULL.
The PCRE_ANCHORED option can be passed in the options argument, whose
unused bits must be zero. However, if a pattern was compiled with
PCRE_ANCHORED, or turned out to be anchored by virtue of its contents, it
cannot be made unanchored at matching time.
There are also three further options that can be set only at matching time:
PCRE_NOTBOL
The first character of the string is not the beginning of a line, so the
circumflex metacharacter should not match before it. Setting this without
PCRE_MULTILINE (at compile time) causes circumflex never to match.
PCRE_NOTEOL
The end of the string is not the end of a line, so the dollar metacharacter
should not match it nor (except in multiline mode) a newline immediately before
it. Setting this without PCRE_MULTILINE (at compile time) causes dollar never
to match.
PCRE_NOTEMPTY
An empty string is not considered to be a valid match if this option is set. If
there are alternatives in the pattern, they are tried. If all the alternatives
match the empty string, the entire match fails. For example, if the pattern
a?b?
is applied to a string not beginning with "a" or "b", it matches the empty
string at the start of the subject. With PCRE_NOTEMPTY set, this match is not
valid, so PCRE searches further into the string for occurrences of "a" or "b".
Perl has no direct equivalent of PCRE_NOTEMPTY, but it does make a special case
of a pattern match of the empty string within its split() function, and
when using the /g modifier. It is possible to emulate Perl's behaviour after
matching a null string by first trying the match again at the same offset with
PCRE_NOTEMPTY set, and then if that fails by advancing the starting offset (see
below) and trying an ordinary match again.
The subject string is passed as a pointer in subject, a length in
length, and a starting offset in startoffset. Unlike the pattern
string, it may contain binary zero characters. When the starting offset is
zero, the search for a match starts at the beginning of the subject, and this
is by far the most common case.
A non-zero starting offset is useful when searching for another match in the
same subject by calling pcre_exec() again after a previous success.
Setting startoffset differs from just passing over a shortened string and
setting PCRE_NOTBOL in the case of a pattern that begins with any kind of
lookbehind. For example, consider the pattern
\Biss\B
which finds occurrences of "iss" in the middle of words. (\B matches only if
the current position in the subject is not a word boundary.) When applied to
the string "Mississippi" the first call to pcre_exec() finds the first
occurrence. If pcre_exec() is called again with just the remainder of the
subject, namely "issippi", it does not match, because \B is always false at the
start of the subject, which is deemed to be a word boundary. However, if
pcre_exec() is passed the entire string again, but with startoffset
set to 4, it finds the second occurrence of "iss" because it is able to look
behind the starting point to discover that it is preceded by a letter.
If a non-zero starting offset is passed when the pattern is anchored, one
attempt to match at the given offset is tried. This can only succeed if the
pattern does not require the match to be at the start of the subject.
In general, a pattern matches a certain portion of the subject, and in
addition, further substrings from the subject may be picked out by parts of the
pattern. Following the usage in Jeffrey Friedl's book, this is called
"capturing" in what follows, and the phrase "capturing subpattern" is used for
a fragment of a pattern that picks out a substring. PCRE supports several other
kinds of parenthesized subpattern that do not cause substrings to be captured.
Captured substrings are returned to the caller via a vector of integer offsets
whose address is passed in ovector. The number of elements in the vector
is passed in ovecsize. The first two-thirds of the vector is used to pass
back captured substrings, each substring using a pair of integers. The
remaining third of the vector is used as workspace by pcre_exec() while
matching capturing subpatterns, and is not available for passing back
information. The length passed in ovecsize should always be a multiple of
three. If it is not, it is rounded down.
When a match has been successful, information about captured substrings is
returned in pairs of integers, starting at the beginning of ovector, and
continuing up to two-thirds of its length at the most. The first element of a
pair is set to the offset of the first character in a substring, and the second
is set to the offset of the first character after the end of a substring. The
first pair, ovector[0] and ovector[1], identify the portion of the
subject string matched by the entire pattern. The next pair is used for the
first capturing subpattern, and so on. The value returned by pcre_exec()
is the number of pairs that have been set. If there are no capturing
subpatterns, the return value from a successful match is 1, indicating that
just the first pair of offsets has been set.
Some convenience functions are provided for extracting the captured substrings
as separate strings. These are described in the following section.
It is possible for a capturing subpattern number n+1 to match some
part of the subject when subpattern n has not been used at all. For
example, if the string "abc" is matched against the pattern (a|(z))(bc),
subpatterns 1 and 3 are matched, but 2 is not. When this happens, both offset
values corresponding to the unused subpattern are set to -1.
If a capturing subpattern is matched repeatedly, it is the last portion of the
string that it matched that gets returned.
If the vector is too small to hold all the captured substrings, it is used as
far as possible (up to two-thirds of its length), and the function returns a
value of zero. In particular, if the substring offsets are not of interest,
pcre_exec() may be called with ovector passed as NULL and
ovecsize as zero. However, if the pattern contains back references and
the ovector isn't big enough to remember the related substrings, PCRE has
to get additional memory for use during matching. Thus it is usually advisable
to supply an ovector.
Note that pcre_info() can be used to find out how many capturing
subpatterns there are in a compiled pattern. The smallest size for
ovector that will allow for n captured substrings in addition to
the offsets of the substring matched by the whole pattern is (n+1)*3.
If pcre_exec() fails, it returns a negative number. The following are
defined in the header file:
PCRE_ERROR_NOMATCH (-1)
The subject string did not match the pattern.
PCRE_ERROR_NULL (-2)
Either code or subject was passed as NULL, or ovector was
NULL and ovecsize was not zero.
PCRE_ERROR_BADOPTION (-3)
An unrecognized bit was set in the options argument.
PCRE_ERROR_BADMAGIC (-4)
PCRE stores a 4-byte "magic number" at the start of the compiled code, to catch
the case when it is passed a junk pointer. This is the error it gives when the
magic number isn't present.
PCRE_ERROR_UNKNOWN_NODE (-5)
While running the pattern match, an unknown item was encountered in the
compiled pattern. This error could be caused by a bug in PCRE or by overwriting
of the compiled pattern.
PCRE_ERROR_NOMEMORY (-6)
If a pattern contains back references, but the ovector that is passed to
pcre_exec() is not big enough to remember the referenced substrings, PCRE
gets a block of memory at the start of matching to use for this purpose. If the
call via pcre_malloc() fails, this error is given. The memory is freed at
the end of matching.
Captured substrings can be accessed directly by using the offsets returned by
pcre_exec() in ovector. For convenience, the functions
pcre_copy_substring(), pcre_get_substring(), and
pcre_get_substring_list() are provided for extracting captured substrings
as new, separate, zero-terminated strings. A substring that contains a binary
zero is correctly extracted and has a further zero added on the end, but the
result does not, of course, function as a C string.
The first three arguments are the same for all three functions: subject
is the subject string which has just been successfully matched, ovector
is a pointer to the vector of integer offsets that was passed to
pcre_exec(), and stringcount is the number of substrings that
were captured by the match, including the substring that matched the entire
regular expression. This is the value returned by pcre_exec if it
is greater than zero. If pcre_exec() returned zero, indicating that it
ran out of space in ovector, the value passed as stringcount should
be the size of the vector divided by three.
The functions pcre_copy_substring() and pcre_get_substring()
extract a single substring, whose number is given as stringnumber. A
value of zero extracts the substring that matched the entire pattern, while
higher values extract the captured substrings. For pcre_copy_substring(),
the string is placed in buffer, whose length is given by
buffersize, while for pcre_get_substring() a new block of memory is
obtained via pcre_malloc, and its address is returned via
stringptr. The yield of the function is the length of the string, not
including the terminating zero, or one of
PCRE_ERROR_NOMEMORY (-6)
The buffer was too small for pcre_copy_substring(), or the attempt to get
memory failed for pcre_get_substring().
PCRE_ERROR_NOSUBSTRING (-7)
There is no substring whose number is stringnumber.
The pcre_get_substring_list() function extracts all available substrings
and builds a list of pointers to them. All this is done in a single block of
memory which is obtained via pcre_malloc. The address of the memory block
is returned via listptr, which is also the start of the list of string
pointers. The end of the list is marked by a NULL pointer. The yield of the
function is zero if all went well, or
PCRE_ERROR_NOMEMORY (-6)
if the attempt to get the memory block failed.
When any of these functions encounter a substring that is unset, which can
happen when capturing subpattern number n+1 matches some part of the
subject, but subpattern n has not been used at all, they return an empty
string. This can be distinguished from a genuine zero-length substring by
inspecting the appropriate offset in ovector, which is negative for unset
substrings.
The two convenience functions pcre_free_substring() and
pcre_free_substring_list() can be used to free the memory returned by
a previous call of pcre_get_substring() or
pcre_get_substring_list(), respectively. They do nothing more than call
the function pointed to by pcre_free, which of course could be called
directly from a C program. However, PCRE is used in some situations where it is
linked via a special interface to another programming language which cannot use
pcre_free directly; it is for these cases that the functions are
provided.
There are some size limitations in PCRE but it is hoped that they will never in
practice be relevant.
The maximum length of a compiled pattern is 65539 (sic) bytes.
All values in repeating quantifiers must be less than 65536.
The maximum number of capturing subpatterns is 99.
The maximum number of all parenthesized subpatterns, including capturing
subpatterns, assertions, and other types of subpattern, is 200.
The maximum length of a subject string is the largest positive number that an
integer variable can hold. However, PCRE uses recursion to handle subpatterns
and indefinite repetition. This means that the available stack space may limit
the size of a subject string that can be processed by certain patterns.
The differences described here are with respect to Perl 5.005.
1. By default, a whitespace character is any character that the C library
function isspace() recognizes, though it is possible to compile PCRE with
alternative character type tables. Normally isspace() matches space,
formfeed, newline, carriage return, horizontal tab, and vertical tab. Perl 5
no longer includes vertical tab in its set of whitespace characters. The \v
escape that was in the Perl documentation for a long time was never in fact
recognized. However, the character itself was treated as whitespace at least
up to 5.002. In 5.004 and 5.005 it does not match \s.
2. PCRE does not allow repeat quantifiers on lookahead assertions. Perl permits
them, but they do not mean what you might think. For example, (?!a){3} does
not assert that the next three characters are not "a". It just asserts that the
next character is not "a" three times.
3. Capturing subpatterns that occur inside negative lookahead assertions are
counted, but their entries in the offsets vector are never set. Perl sets its
numerical variables from any such patterns that are matched before the
assertion fails to match something (thereby succeeding), but only if the
negative lookahead assertion contains just one branch.
4. Though binary zero characters are supported in the subject string, they are
not allowed in a pattern string because it is passed as a normal C string,
terminated by zero. The escape sequence "\0" can be used in the pattern to
represent a binary zero.
5. The following Perl escape sequences are not supported: \l, \u, \L, \U,
\E, \Q. In fact these are implemented by Perl's general string-handling and
are not part of its pattern matching engine.
6. The Perl \G assertion is not supported as it is not relevant to single
pattern matches.
7. Fairly obviously, PCRE does not support the (?{code}) and (?p{code})
constructions. However, there is some experimental support for recursive
patterns using the non-Perl item (?R).
8. There are at the time of writing some oddities in Perl 5.005_02 concerned
with the settings of captured strings when part of a pattern is repeated. For
example, matching "aba" against the pattern /^(a(b)?)+$/ sets $2 to the value
"b", but matching "aabbaa" against /^(aa(bb)?)+$/ leaves $2 unset. However, if
the pattern is changed to /^(aa(b(b))?)+$/ then $2 (and $3) are set.
In Perl 5.004 $2 is set in both cases, and that is also true of PCRE. If in the
future Perl changes to a consistent state that is different, PCRE may change to
follow.
9. Another as yet unresolved discrepancy is that in Perl 5.005_02 the pattern
/^(a)?(?(1)a|b)+$/ matches the string "a", whereas in PCRE it does not.
However, in both Perl and PCRE /^(a)?a/ matched against "a" leaves $1 unset.
10. PCRE provides some extensions to the Perl regular expression facilities:
(a) Although lookbehind assertions must match fixed length strings, each
alternative branch of a lookbehind assertion can match a different length of
string. Perl 5.005 requires them all to have the same length.
(b) If PCRE_DOLLAR_ENDONLY is set and PCRE_MULTILINE is not set, the $
meta-character matches only at the very end of the string.
(c) If PCRE_EXTRA is set, a backslash followed by a letter with no special
meaning is faulted.
(d) If PCRE_UNGREEDY is set, the greediness of the repetition quantifiers is
inverted, that is, by default they are not greedy, but if followed by a
question mark they are.
(e) PCRE_ANCHORED can be used to force a pattern to be tried only at the start
of the subject.
(f) The PCRE_NOTBOL, PCRE_NOTEOL, and PCRE_NOTEMPTY options for
pcre_exec() have no Perl equivalents.
(g) The (?R) construct allows for recursive pattern matching (Perl 5.6 can do
this using the (?p{code}) construct, which PCRE cannot of course support.)
The syntax and semantics of the regular expressions supported by PCRE are
described below. Regular expressions are also described in the Perl
documentation and in a number of other books, some of which have copious
examples. Jeffrey Friedl's "Mastering Regular Expressions", published by
O'Reilly (ISBN 1-56592-257-3), covers them in great detail.
The description here is intended as reference documentation. The basic
operation of PCRE is on strings of bytes. However, there are the beginnings of
some support for UTF-8 character strings. To use this support you must
configure PCRE to include it, and then call pcre_compile() with the
PCRE_UTF8 option. How this affects the pattern matching is described in the
final section of this document.
A regular expression is a pattern that is matched against a subject string from
left to right. Most characters stand for themselves in a pattern, and match the
corresponding characters in the subject. As a trivial example, the pattern
The quick brown fox
matches a portion of a subject string that is identical to itself. The power of
regular expressions comes from the ability to include alternatives and
repetitions in the pattern. These are encoded in the pattern by the use of
meta-characters, which do not stand for themselves but instead are
interpreted in some special way.
There are two different sets of meta-characters: those that are recognized
anywhere in the pattern except within square brackets, and those that are
recognized in square brackets. Outside square brackets, the meta-characters are
as follows:
\ general escape character with several uses
^ assert start of subject (or line, in multiline mode)
$ assert end of subject (or line, in multiline mode)
. match any character except newline (by default)
[ start character class definition
| start of alternative branch
( start subpattern
) end subpattern
? extends the meaning of (
also 0 or 1 quantifier
also quantifier minimizer
* 0 or more quantifier
+ 1 or more quantifier
{ start min/max quantifier
Part of a pattern that is in square brackets is called a "character class". In
a character class the only meta-characters are:
\ general escape character
^ negate the class, but only if the first character
- indicates character range
] terminates the character class
The following sections describe the use of each of the meta-characters.
The backslash character has several uses. Firstly, if it is followed by a
non-alphameric character, it takes away any special meaning that character may
have. This use of backslash as an escape character applies both inside and
outside character classes.
For example, if you want to match a "*" character, you write "\*" in the
pattern. This applies whether or not the following character would otherwise be
interpreted as a meta-character, so it is always safe to precede a
non-alphameric with "\" to specify that it stands for itself. In particular,
if you want to match a backslash, you write "\\".
If a pattern is compiled with the PCRE_EXTENDED option, whitespace in the
pattern (other than in a character class) and characters between a "#" outside
a character class and the next newline character are ignored. An escaping
backslash can be used to include a whitespace or "#" character as part of the
pattern.
A second use of backslash provides a way of encoding non-printing characters
in patterns in a visible manner. There is no restriction on the appearance of
non-printing characters, apart from the binary zero that terminates a pattern,
but when a pattern is being prepared by text editing, it is usually easier to
use one of the following escape sequences than the binary character it
represents:
\a alarm, that is, the BEL character (hex 07)
\cx "control-x", where x is any character
\e escape (hex 1B)
\f formfeed (hex 0C)
\n newline (hex 0A)
\r carriage return (hex 0D)
\t tab (hex 09)
\xhh character with hex code hh
\ddd character with octal code ddd, or backreference
The precise effect of "\cx" is as follows: if "x" is a lower case letter, it
is converted to upper case. Then bit 6 of the character (hex 40) is inverted.
Thus "\cz" becomes hex 1A, but "\c{" becomes hex 3B, while "\c;" becomes hex
7B.
After "\x", up to two hexadecimal digits are read (letters can be in upper or
lower case).
After "\0" up to two further octal digits are read. In both cases, if there
are fewer than two digits, just those that are present are used. Thus the
sequence "\0\x\07" specifies two binary zeros followed by a BEL character.
Make sure you supply two digits after the initial zero if the character that
follows is itself an octal digit.
The handling of a backslash followed by a digit other than 0 is complicated.
Outside a character class, PCRE reads it and any following digits as a decimal
number. If the number is less than 10, or if there have been at least that many
previous capturing left parentheses in the expression, the entire sequence is
taken as a back reference. A description of how this works is given
later, following the discussion of parenthesized subpatterns.
Inside a character class, or if the decimal number is greater than 9 and there
have not been that many capturing subpatterns, PCRE re-reads up to three octal
digits following the backslash, and generates a single byte from the least
significant 8 bits of the value. Any subsequent digits stand for themselves.
For example:
\040 is another way of writing a space
\40 is the same, provided there are fewer than 40
previous capturing subpatterns
\7 is always a back reference
\11 might be a back reference, or another way of
writing a tab
\011 is always a tab
\0113 is a tab followed by the character "3"
\113 is the character with octal code 113 (since there
can be no more than 99 back references)
\377 is a byte consisting entirely of 1 bits
\81 is either a back reference, or a binary zero
followed by the two characters "8" and "1"
Note that octal values of 100 or greater must not be introduced by a leading
zero, because no more than three octal digits are ever read.
All the sequences that define a single byte value can be used both inside and
outside character classes. In addition, inside a character class, the sequence
"\b" is interpreted as the backspace character (hex 08). Outside a character
class it has a different meaning (see below).
The third use of backslash is for specifying generic character types:
\d any decimal digit
\D any character that is not a decimal digit
\s any whitespace character
\S any character that is not a whitespace character
\w any "word" character
\W any "non-word" character
Each pair of escape sequences partitions the complete set of characters into
two disjoint sets. Any given character matches one, and only one, of each pair.
A "word" character is any letter or digit or the underscore character, that is,
any character which can be part of a Perl "word". The definition of letters and
digits is controlled by PCRE's character tables, and may vary if
locale-specific matching is taking place (see "Locale support" above). For example, in
the "fr" (French) locale, some character codes greater than 128 are used for
accented letters, and these are matched by \w.
These character type sequences can appear both inside and outside character
classes. They each match one character of the appropriate type. If the current
matching point is at the end of the subject string, all of them fail, since
there is no character to match.
The fourth use of backslash is for certain simple assertions. An assertion
specifies a condition that has to be met at a particular point in a match,
without consuming any characters from the subject string. The use of
subpatterns for more complicated assertions is described below. The backslashed
assertions are
\b word boundary
\B not a word boundary
\A start of subject (independent of multiline mode)
\Z end of subject or newline at end (independent of multiline mode)
\z end of subject (independent of multiline mode)
These assertions may not appear in character classes (but note that "\b" has a
different meaning, namely the backspace character, inside a character class).
A word boundary is a position in the subject string where the current character
and the previous character do not both match \w or \W (i.e. one matches
\w and the other matches \W), or the start or end of the string if the
first or last character matches \w, respectively.
The \A, \Z, and \z assertions differ from the traditional circumflex and
dollar (described below) in that they only ever match at the very start and end
of the subject string, whatever options are set. They are not affected by the
PCRE_NOTBOL or PCRE_NOTEOL options. If the startoffset argument of
pcre_exec() is non-zero, \A can never match. The difference between \Z
and \z is that \Z matches before a newline that is the last character of the
string as well as at the end of the string, whereas \z matches only at the
end.
Outside a character class, in the default matching mode, the circumflex
character is an assertion which is true only if the current matching point is
at the start of the subject string. If the startoffset argument of
pcre_exec() is non-zero, circumflex can never match. Inside a character
class, circumflex has an entirely different meaning (see below).
Circumflex need not be the first character of the pattern if a number of
alternatives are involved, but it should be the first thing in each alternative
in which it appears if the pattern is ever to match that branch. If all
possible alternatives start with a circumflex, that is, if the pattern is
constrained to match only at the start of the subject, it is said to be an
"anchored" pattern. (There are also other constructs that can cause a pattern
to be anchored.)
A dollar character is an assertion which is true only if the current matching
point is at the end of the subject string, or immediately before a newline
character that is the last character in the string (by default). Dollar need
not be the last character of the pattern if a number of alternatives are
involved, but it should be the last item in any branch in which it appears.
Dollar has no special meaning in a character class.
The meaning of dollar can be changed so that it matches only at the very end of
the string, by setting the PCRE_DOLLAR_ENDONLY option at compile or matching
time. This does not affect the \Z assertion.
The meanings of the circumflex and dollar characters are changed if the
PCRE_MULTILINE option is set. When this is the case, they match immediately
after and immediately before an internal "\n" character, respectively, in
addition to matching at the start and end of the subject string. For example,
the pattern /^abc$/ matches the subject string "def\nabc" in multiline mode,
but not otherwise. Consequently, patterns that are anchored in single line mode
because all branches start with "^" are not anchored in multiline mode, and a
match for circumflex is possible when the startoffset argument of
pcre_exec() is non-zero. The PCRE_DOLLAR_ENDONLY option is ignored if
PCRE_MULTILINE is set.
Note that the sequences \A, \Z, and \z can be used to match the start and
end of the subject in both modes, and if all branches of a pattern start with
\A it is always anchored, whether PCRE_MULTILINE is set or not.
Outside a character class, a dot in the pattern matches any one character in
the subject, including a non-printing character, but not (by default) newline.
If the PCRE_DOTALL option is set, dots match newlines as well. The handling of
dot is entirely independent of the handling of circumflex and dollar, the only
relationship being that they both involve newline characters. Dot has no
special meaning in a character class.
An opening square bracket introduces a character class, terminated by a closing
square bracket. A closing square bracket on its own is not special. If a
closing square bracket is required as a member of the class, it should be the
first data character in the class (after an initial circumflex, if present) or
escaped with a backslash.
A character class matches a single character in the subject; the character must
be in the set of characters defined by the class, unless the first character in
the class is a circumflex, in which case the subject character must not be in
the set defined by the class. If a circumflex is actually required as a member
of the class, ensure it is not the first character, or escape it with a
backslash.
For example, the character class [aeiou] matches any lower case vowel, while
[^aeiou] matches any character that is not a lower case vowel. Note that a
circumflex is just a convenient notation for specifying the characters which
are in the class by enumerating those that are not. It is not an assertion: it
still consumes a character from the subject string, and fails if the current
pointer is at the end of the string.
When caseless matching is set, any letters in a class represent both their
upper case and lower case versions, so for example, a caseless [aeiou] matches
"A" as well as "a", and a caseless [^aeiou] does not match "A", whereas a
caseful version would.
The newline character is never treated in any special way in character classes,
whatever the setting of the PCRE_DOTALL or PCRE_MULTILINE options is. A class
such as [^a] will always match a newline.
The minus (hyphen) character can be used to specify a range of characters in a
character class. For example, [d-m] matches any letter between d and m,
inclusive. If a minus character is required in a class, it must be escaped with
a backslash or appear in a position where it cannot be interpreted as
indicating a range, typically as the first or last character in the class.
It is not possible to have the literal character "]" as the end character of a
range. A pattern such as [W-]46] is interpreted as a class of two characters
("W" and "-") followed by a literal string "46]", so it would match "W46]" or
"-46]". However, if the "]" is escaped with a backslash it is interpreted as
the end of range, so [W-\]46] is interpreted as a single class containing a
range followed by two separate characters. The octal or hexadecimal
representation of "]" can also be used to end a range.
Ranges operate in ASCII collating sequence. They can also be used for
characters specified numerically, for example [\000-\037]. If a range that
includes letters is used when caseless matching is set, it matches the letters
in either case. For example, [W-c] is equivalent to [][\^_`wxyzabc], matched
caselessly, and if character tables for the "fr" locale are in use,
[\xc8-\xcb] matches accented E characters in both cases.
The character types \d, \D, \s, \S, \w, and \W may also appear in a
character class, and add the characters that they match to the class. For
example, [\dABCDEF] matches any hexadecimal digit. A circumflex can
conveniently be used with the upper case character types to specify a more
restricted set of characters than the matching lower case type. For example,
the class [^\W_] matches any letter or digit, but not underscore.
All non-alphameric characters other than \, -, ^ (at the start) and the
terminating ] are non-special in character classes, but it does no harm if they
are escaped.
Perl 5.6 (not yet released at the time of writing) is going to support the
POSIX notation for character classes, which uses names enclosed by [: and :]
within the enclosing square brackets. PCRE supports this notation. For example,
[01[:alpha:]%]
matches "0", "1", any alphabetic character, or "%". The supported class names
are
alnum letters and digits
alpha letters
ascii character codes 0 - 127
cntrl control characters
digit decimal digits (same as \d)
graph printing characters, excluding space
lower lower case letters
print printing characters, including space
punct printing characters, excluding letters and digits
space white space (same as \s)
upper upper case letters
word "word" characters (same as \w)
xdigit hexadecimal digits
The names "ascii" and "word" are Perl extensions. Another Perl extension is
negation, which is indicated by a ^ character after the colon. For example,
[12[:^digit:]]
matches "1", "2", or any non-digit. PCRE (and Perl) also recognize the POSIX
syntax [.ch.] and [=ch=] where "ch" is a "collating element", but these are not
supported, and an error is given if they are encountered.
Vertical bar characters are used to separate alternative patterns. For example,
the pattern
gilbert|sullivan
matches either "gilbert" or "sullivan". Any number of alternatives may appear,
and an empty alternative is permitted (matching the empty string).
The matching process tries each alternative in turn, from left to right,
and the first one that succeeds is used. If the alternatives are within a
subpattern (defined below), "succeeds" means matching the rest of the main
pattern as well as the alternative in the subpattern.
The settings of PCRE_CASELESS, PCRE_MULTILINE, PCRE_DOTALL, and PCRE_EXTENDED
can be changed from within the pattern by a sequence of Perl option letters
enclosed between "(?" and ")". The option letters are
i for PCRE_CASELESS
m for PCRE_MULTILINE
s for PCRE_DOTALL
x for PCRE_EXTENDED
For example, (?im) sets caseless, multiline matching. It is also possible to
unset these options by preceding the letter with a hyphen, and a combined
setting and unsetting such as (?im-sx), which sets PCRE_CASELESS and
PCRE_MULTILINE while unsetting PCRE_DOTALL and PCRE_EXTENDED, is also
permitted. If a letter appears both before and after the hyphen, the option is
unset.
The scope of these option changes depends on where in the pattern the setting
occurs. For settings that are outside any subpattern (defined below), the
effect is the same as if the options were set or unset at the start of
matching. The following patterns all behave in exactly the same way:
(?i)abc
a(?i)bc
ab(?i)c
abc(?i)
which in turn is the same as compiling the pattern abc with PCRE_CASELESS set.
In other words, such "top level" settings apply to the whole pattern (unless
there are other changes inside subpatterns). If there is more than one setting
of the same option at top level, the rightmost setting is used.
If an option change occurs inside a subpattern, the effect is different. This
is a change of behaviour in Perl 5.005. An option change inside a subpattern
affects only that part of the subpattern that follows it, so
(a(?i)b)c
matches abc and aBc and no other strings (assuming PCRE_CASELESS is not used).
By this means, options can be made to have different settings in different
parts of the pattern. Any changes made in one alternative do carry on
into subsequent branches within the same subpattern. For example,
(a(?i)b|c)
matches "ab", "aB", "c", and "C", even though when matching "C" the first
branch is abandoned before the option setting. This is because the effects of
option settings happen at compile time. There would be some very weird
behaviour otherwise.
The PCRE-specific options PCRE_UNGREEDY and PCRE_EXTRA can be changed in the
same way as the Perl-compatible options by using the characters U and X
respectively. The (?X) flag setting is special in that it must always occur
earlier in the pattern than any of the additional features it turns on, even
when it is at top level. It is best put at the start.
Subpatterns are delimited by parentheses (round brackets), which can be nested.
Marking part of a pattern as a subpattern does two things:
1. It localizes a set of alternatives. For example, the pattern
cat(aract|erpillar|)
matches one of the words "cat", "cataract", or "caterpillar". Without the
parentheses, it would match "cataract", "erpillar" or the empty string.
2. It sets up the subpattern as a capturing subpattern (as defined above).
When the whole pattern matches, that portion of the subject string that matched
the subpattern is passed back to the caller via the ovector argument of
pcre_exec(). Opening parentheses are counted from left to right (starting
from 1) to obtain the numbers of the capturing subpatterns.
For example, if the string "the red king" is matched against the pattern
the ((red|white) (king|queen))
the captured substrings are "red king", "red", and "king", and are numbered 1,
2, and 3.
The fact that plain parentheses fulfil two functions is not always helpful.
There are often times when a grouping subpattern is required without a
capturing requirement. If an opening parenthesis is followed by "?:", the
subpattern does not do any capturing, and is not counted when computing the
number of any subsequent capturing subpatterns. For example, if the string "the
white queen" is matched against the pattern
the ((?:red|white) (king|queen))
the captured substrings are "white queen" and "queen", and are numbered 1 and
2. The maximum number of captured substrings is 99, and the maximum number of
all subpatterns, both capturing and non-capturing, is 200.
As a convenient shorthand, if any option settings are required at the start of
a non-capturing subpattern, the option letters may appear between the "?" and
the ":". Thus the two patterns
(?i:saturday|sunday)
(?:(?i)saturday|sunday)
match exactly the same set of strings. Because alternative branches are tried
from left to right, and options are not reset until the end of the subpattern
is reached, an option setting in one branch does affect subsequent branches, so
the above patterns match "SUNDAY" as well as "Saturday".
Repetition is specified by quantifiers, which can follow any of the following
items:
a single character, possibly escaped
the . metacharacter
a character class
a back reference (see next section)
a parenthesized subpattern (unless it is an assertion - see below)
The general repetition quantifier specifies a minimum and maximum number of
permitted matches, by giving the two numbers in curly brackets (braces),
separated by a comma. The numbers must be less than 65536, and the first must
be less than or equal to the second. For example:
z{2,4}
matches "zz", "zzz", or "zzzz". A closing brace on its own is not a special
character. If the second number is omitted, but the comma is present, there is
no upper limit; if the second number and the comma are both omitted, the
quantifier specifies an exact number of required matches. Thus
[aeiou]{3,}
matches at least 3 successive vowels, but may match many more, while
\d{8}
matches exactly 8 digits. An opening curly bracket that appears in a position
where a quantifier is not allowed, or one that does not match the syntax of a
quantifier, is taken as a literal character. For example, {,6} is not a
quantifier, but a literal string of four characters.
The quantifier {0} is permitted, causing the expression to behave as if the
previous item and the quantifier were not present.
For convenience (and historical compatibility) the three most common
quantifiers have single-character abbreviations:
* is equivalent to {0,}
+ is equivalent to {1,}
? is equivalent to {0,1}
It is possible to construct infinite loops by following a subpattern that can
match no characters with a quantifier that has no upper limit, for example:
(a?)*
Earlier versions of Perl and PCRE used to give an error at compile time for
such patterns. However, because there are cases where this can be useful, such
patterns are now accepted, but if any repetition of the subpattern does in fact
match no characters, the loop is forcibly broken.
By default, the quantifiers are "greedy", that is, they match as much as
possible (up to the maximum number of permitted times), without causing the
rest of the pattern to fail. The classic example of where this gives problems
is in trying to match comments in C programs. These appear between the
sequences /* and */ and within the sequence, individual * and / characters may
appear. An attempt to match C comments by applying the pattern
/\*.*\*/
to the string
/* first comment */ not comment /* second comment */
fails, because it matches the entire string owing to the greediness of the .*
item.
However, if a quantifier is followed by a question mark, it ceases to be
greedy, and instead matches the minimum number of times possible, so the
pattern
/\*.*?\*/
does the right thing with the C comments. The meaning of the various
quantifiers is not otherwise changed, just the preferred number of matches.
Do not confuse this use of question mark with its use as a quantifier in its
own right. Because it has two uses, it can sometimes appear doubled, as in
\d??\d
which matches one digit by preference, but can match two if that is the only
way the rest of the pattern matches.
If the PCRE_UNGREEDY option is set (an option which is not available in Perl),
the quantifiers are not greedy by default, but individual ones can be made
greedy by following them with a question mark. In other words, it inverts the
default behaviour.
When a parenthesized subpattern is quantified with a minimum repeat count that
is greater than 1 or with a limited maximum, more store is required for the
compiled pattern, in proportion to the size of the minimum or maximum.
If a pattern starts with .* or .{0,} and the PCRE_DOTALL option (equivalent
to Perl's /s) is set, thus allowing the . to match newlines, the pattern is
implicitly anchored, because whatever follows will be tried against every
character position in the subject string, so there is no point in retrying the
overall match at any position after the first. PCRE treats such a pattern as
though it were preceded by \A. In cases where it is known that the subject
string contains no newlines, it is worth setting PCRE_DOTALL when the pattern
begins with .* in order to obtain this optimization, or alternatively using ^
to indicate anchoring explicitly.
When a capturing subpattern is repeated, the value captured is the substring
that matched the final iteration. For example, after
(tweedle[dume]{3}\s*)+
has matched "tweedledum tweedledee" the value of the captured substring is
"tweedledee". However, if there are nested capturing subpatterns, the
corresponding captured values may have been set in previous iterations. For
example, after
/(a|(b))+/
matches "aba" the value of the second captured substring is "b".
Outside a character class, a backslash followed by a digit greater than 0 (and
possibly further digits) is a back reference to a capturing subpattern earlier
(i.e. to its left) in the pattern, provided there have been that many previous
capturing left parentheses.
However, if the decimal number following the backslash is less than 10, it is
always taken as a back reference, and causes an error only if there are not
that many capturing left parentheses in the entire pattern. In other words, the
parentheses that are referenced need not be to the left of the reference for
numbers less than 10. See the section entitled "Backslash" above for further
details of the handling of digits following a backslash.
A back reference matches whatever actually matched the capturing subpattern in
the current subject string, rather than anything matching the subpattern
itself. So the pattern
(sens|respons)e and \1ibility
matches "sense and sensibility" and "response and responsibility", but not
"sense and responsibility". If caseful matching is in force at the time of the
back reference, the case of letters is relevant. For example,
((?i)rah)\s+\1
matches "rah rah" and "RAH RAH", but not "RAH rah", even though the original
capturing subpattern is matched caselessly.
There may be more than one back reference to the same subpattern. If a
subpattern has not actually been used in a particular match, any back
references to it always fail. For example, the pattern
(a|(bc))\2
always fails if it starts to match "a" rather than "bc". Because there may be
up to 99 back references, all digits following the backslash are taken
as part of a potential back reference number. If the pattern continues with a
digit character, some delimiter must be used to terminate the back reference.
If the PCRE_EXTENDED option is set, this can be whitespace. Otherwise an empty
comment can be used.
A back reference that occurs inside the parentheses to which it refers fails
when the subpattern is first used, so, for example, (a\1) never matches.
However, such references can be useful inside repeated subpatterns. For
example, the pattern
(a|b\1)+
matches any number of "a"s and also "aba", "ababbaa" etc. At each iteration of
the subpattern, the back reference matches the character string corresponding
to the previous iteration. In order for this to work, the pattern must be such
that the first iteration does not need to match the back reference. This can be
done using alternation, as in the example above, or by a quantifier with a
minimum of zero.
An assertion is a test on the characters following or preceding the current
matching point that does not actually consume any characters. The simple
assertions coded as \b, \B, \A, \Z, \z, ^ and $ are described above. More
complicated assertions are coded as subpatterns. There are two kinds: those
that look ahead of the current position in the subject string, and those that
look behind it.
An assertion subpattern is matched in the normal way, except that it does not
cause the current matching position to be changed. Lookahead assertions start
with (?= for positive assertions and (?! for negative assertions. For example,
\w+(?=;)
matches a word followed by a semicolon, but does not include the semicolon in
the match, and
foo(?!bar)
matches any occurrence of "foo" that is not followed by "bar". Note that the
apparently similar pattern
(?!foo)bar
does not find an occurrence of "bar" that is preceded by something other than
"foo"; it finds any occurrence of "bar" whatsoever, because the assertion
(?!foo) is always true when the next three characters are "bar". A
lookbehind assertion is needed to achieve this effect.
Lookbehind assertions start with (?<= for positive assertions and (?<! for
negative assertions. For example,
(?<!foo)bar
does find an occurrence of "bar" that is not preceded by "foo". The contents of
a lookbehind assertion are restricted such that all the strings it matches must
have a fixed length. However, if there are several alternatives, they do not
all have to have the same fixed length. Thus
(?<=bullock|donkey)
is permitted, but
(?<!dogs?|cats?)
causes an error at compile time. Branches that match different length strings
are permitted only at the top level of a lookbehind assertion. This is an
extension compared with Perl 5.005, which requires all branches to match the
same length of string. An assertion such as
(?<=ab(c|de))
is not permitted, because its single top-level branch can match two different
lengths, but it is acceptable if rewritten to use two top-level branches:
(?<=abc|abde)
The implementation of lookbehind assertions is, for each alternative, to
temporarily move the current position back by the fixed width and then try to
match. If there are insufficient characters before the current position, the
match is deemed to fail. Lookbehinds in conjunction with once-only subpatterns
can be particularly useful for matching at the ends of strings; an example is
given at the end of the section on once-only subpatterns.
Several assertions (of any sort) may occur in succession. For example,
(?<=\d{3})(?<!999)foo
matches "foo" preceded by three digits that are not "999". Notice that each of
the assertions is applied independently at the same point in the subject
string. First there is a check that the previous three characters are all
digits, and then there is a check that the same three characters are not "999".
This pattern does not match "foo" preceded by six characters, the first three
of which are digits and the last three of which are not "999". For example, it
doesn't match "123abcfoo". A pattern to do that is
(?<=\d{3}...)(?<!999)foo
This time the first assertion looks at the preceding six characters, checking
that the first three are digits, and then the second assertion checks that the
preceding three characters are not "999".
Assertions can be nested in any combination. For example,
(?<=(?<!foo)bar)baz
matches an occurrence of "baz" that is preceded by "bar" which in turn is not
preceded by "foo", while
(?<=\d{3}(?!999)...)foo
is another pattern which matches "foo" preceded by three digits and any three
characters that are not "999".
Assertion subpatterns are not capturing subpatterns, and may not be repeated,
because it makes no sense to assert the same thing several times. If any kind
of assertion contains capturing subpatterns within it, these are counted for
the purposes of numbering the capturing subpatterns in the whole pattern.
However, substring capturing is carried out only for positive assertions,
because it does not make sense for negative assertions.
Assertions count towards the maximum of 200 parenthesized subpatterns.
With both maximizing and minimizing repetition, failure of what follows
normally causes the repeated item to be re-evaluated to see if a different
number of repeats allows the rest of the pattern to match. Sometimes it is
useful to prevent this, either to change the nature of the match, or to cause
it to fail earlier than it otherwise might, when the author of the pattern knows
there is no point in carrying on.
Consider, for example, the pattern \d+foo when applied to the subject line
123456bar
After matching all 6 digits and then failing to match "foo", the normal
action of the matcher is to try again with only 5 digits matching the \d+
item, and then with 4, and so on, before ultimately failing. Once-only
subpatterns provide the means for specifying that once a portion of the pattern
has matched, it is not to be re-evaluated in this way, so the matcher would
give up immediately on failing to match "foo" the first time. The notation is
another kind of special parenthesis, starting with (?> as in this example:
(?>\d+)foo
This kind of parenthesis "locks up" the part of the pattern it contains once
it has matched, and a failure further into the pattern is prevented from
backtracking into it. Backtracking past it to previous items, however, works as
normal.
An alternative description is that a subpattern of this type matches the string
of characters that an identical standalone pattern would match, if anchored at
the current point in the subject string.
Once-only subpatterns are not capturing subpatterns. Simple cases such as the
above example can be thought of as a maximizing repeat that must swallow
everything it can. So, while both \d+ and \d+? are prepared to adjust the
number of digits they match in order to make the rest of the pattern match,
(?>\d+) can only match an entire sequence of digits.
This construction can of course contain arbitrarily complicated subpatterns,
and it can be nested.
Once-only subpatterns can be used in conjunction with lookbehind assertions to
specify efficient matching at the end of the subject string. Consider a simple
pattern such as
abcd$
when applied to a long string which does not match. Because matching proceeds
from left to right, PCRE will look for each "a" in the subject and then see if
what follows matches the rest of the pattern. If the pattern is specified as
^.*abcd$
the initial .* matches the entire string at first, but when this fails (because
there is no following "a"), it backtracks to match all but the last character,
then all but the last two characters, and so on. Once again the search for "a"
covers the entire string, from right to left, so we are no better off. However,
if the pattern is written as
^(?>.*)(?<=abcd)
there can be no backtracking for the .* item; it can match only the entire
string. The subsequent lookbehind assertion does a single test on the last four
characters. If it fails, the match fails immediately. For long strings, this
approach makes a significant difference to the processing time.
When a pattern contains an unlimited repeat inside a subpattern that can itself
be repeated an unlimited number of times, the use of a once-only subpattern is
the only way to avoid some failing matches taking a very long time indeed.
The pattern
(\D+|<\d+>)*[!?]
matches an unlimited number of substrings that either consist of non-digits, or
digits enclosed in <>, followed by either ! or ?. When it matches, it runs
quickly. However, if it is applied to
it takes a long time before reporting failure. This is because the string can
be divided between the two repeats in a large number of ways, and all have to
be tried. (The example used [!?] rather than a single character at the end,
because both PCRE and Perl have an optimization that allows for fast failure
when a single character is used. They remember the last single character that
is required for a match, and fail early if it is not present in the string.)
If the pattern is changed to
((?>\D+)|<\d+>)*[!?]
sequences of non-digits cannot be broken, and failure happens quickly.
It is possible to cause the matching process to obey a subpattern
conditionally or to choose between two alternative subpatterns, depending on
the result of an assertion, or whether a previous capturing subpattern matched
or not. The two possible forms of conditional subpattern are
(?(condition)yes-pattern)
(?(condition)yes-pattern|no-pattern)
If the condition is satisfied, the yes-pattern is used; otherwise the
no-pattern (if present) is used. If there are more than two alternatives in the
subpattern, a compile-time error occurs.
There are two kinds of condition. If the text between the parentheses consists
of a sequence of digits, the condition is satisfied if the capturing subpattern
of that number has previously matched. The number must be greater than zero.
Consider the following pattern, which contains non-significant white space to
make it more readable (assume the PCRE_EXTENDED option) and to divide it into
three parts for ease of discussion:
( \( )? [^()]+ (?(1) \) )
The first part matches an optional opening parenthesis, and if that
character is present, sets it as the first captured substring. The second part
matches one or more characters that are not parentheses. The third part is a
conditional subpattern that tests whether the first set of parentheses matched
or not. If they did, that is, if the subject started with an opening parenthesis,
the condition is true, and so the yes-pattern is executed and a closing
parenthesis is required. Otherwise, since no-pattern is not present, the
subpattern matches nothing. In other words, this pattern matches a sequence of
non-parentheses, optionally enclosed in parentheses.
If the condition is not a sequence of digits, it must be an assertion. This may
be a positive or negative lookahead or lookbehind assertion. Consider this
pattern, again containing non-significant white space, and with the two
alternatives on the second line:
(?(?=[^a-z]*[a-z])
\d{2}-[a-z]{3}-\d{2} | \d{2}-\d{2}-\d{2} )
The condition is a positive lookahead assertion that matches an optional
sequence of non-letters followed by a letter. In other words, it tests for the
presence of at least one letter in the subject. If a letter is found, the
subject is matched against the first alternative; otherwise it is matched
against the second. This pattern matches strings in one of the two forms
dd-aaa-dd or dd-dd-dd, where aaa are letters and dd are digits.
The sequence (?# marks the start of a comment which continues up to the next
closing parenthesis. Nested parentheses are not permitted. The characters
that make up a comment play no part in the pattern matching at all.
If the PCRE_EXTENDED option is set, an unescaped # character outside a
character class introduces a comment that continues up to the next newline
character in the pattern.
Consider the problem of matching a string in parentheses, allowing for
unlimited nested parentheses. Without the use of recursion, the best that can
be done is to use a pattern that matches up to some fixed depth of nesting. It
is not possible to handle an arbitrary nesting depth. Perl 5.6 has provided an
experimental facility that allows regular expressions to recurse (amongst other
things). It does this by interpolating Perl code in the expression at run time,
and the code can refer to the expression itself. A Perl pattern to solve the
parentheses problem can be created like this:
$re = qr{\( (?: (?>[^()]+) | (?p{$re}) )* \)}x;
The (?p{...}) item interpolates Perl code at run time, and in this case refers
recursively to the pattern in which it appears. Obviously, PCRE cannot support
the interpolation of Perl code. Instead, the special item (?R) is provided for
the specific case of recursion. This PCRE pattern solves the parentheses
problem (assume the PCRE_EXTENDED option is set so that white space is
ignored):
\( ( (?>[^()]+) | (?R) )* \)
First it matches an opening parenthesis. Then it matches any number of
substrings which can either be a sequence of non-parentheses, or a recursive
match of the pattern itself (i.e. a correctly parenthesized substring). Finally
there is a closing parenthesis.
This particular example pattern contains nested unlimited repeats, and so the
use of a once-only subpattern for matching strings of non-parentheses is
important when applying the pattern to strings that do not match. For example,
when it is applied to
it yields "no match" quickly. However, if a once-only subpattern is not used,
the match runs for a very long time indeed because there are so many different
ways the + and * repeats can carve up the subject, and all have to be tested
before failure can be reported.
The values set for any capturing subpatterns are those from the outermost level
of the recursion at which the subpattern value is set. If the pattern above is
matched against
(ab(cd)ef)
the value for the capturing parentheses is "ef", which is the last value taken
on at the top level. If additional parentheses are added, giving
\( ( ( (?>[^()]+) | (?R) )* ) \)
   ^                        ^
   ^                        ^
the string they capture is "ab(cd)ef", the contents of the top level
parentheses. If there are more than 15 capturing parentheses in a pattern, PCRE
has to obtain extra memory to store data during a recursion, which it does by
using pcre_malloc, freeing it via pcre_free afterwards. If no
memory can be obtained, it saves data for the first 15 capturing parentheses
only, as there is no way to give an out-of-memory error from within a
recursion.
Certain items that may appear in patterns are more efficient than others. It is
more efficient to use a character class like [aeiou] than a set of alternatives
such as (a|e|i|o|u). In general, the simplest construction that provides the
required behaviour is usually the most efficient. Jeffrey Friedl's book
contains a lot of discussion about optimizing regular expressions for efficient
performance.
When a pattern begins with .* and the PCRE_DOTALL option is set, the pattern is
implicitly anchored by PCRE, since it can match only at the start of a subject
string. However, if PCRE_DOTALL is not set, PCRE cannot make this optimization,
because the . metacharacter does not then match a newline, and if the subject
string contains newlines, the pattern may match from the character immediately
following one of them instead of from the very start. For example, the pattern
(.*) second
matches the subject "first\nand second" (where \n stands for a newline
character) with the first captured substring being "and". In order to do this,
PCRE has to retry the match starting after every newline in the subject.
If you are using such a pattern with subject strings that do not contain
newlines, the best performance is obtained by setting PCRE_DOTALL, or starting
the pattern with ^.* to indicate explicit anchoring. That saves PCRE from
having to scan along the subject looking for a newline to restart at.
Beware of patterns that contain nested indefinite repeats. These can take a
long time to run when applied to a string that does not match. Consider the
pattern fragment
(a+)*
This can match "aaaa" in 33 different ways, and this number increases very
rapidly as the string gets longer. (The * repeat can match 0, 1, 2, 3, or 4
times, and for each of those cases other than 0, the + repeats can match
different numbers of times.) When the remainder of the pattern is such that the
entire match is going to fail, PCRE has in principle to try every possible
variation, and this can take an extremely long time.
An optimization catches some of the more simple cases such as
(a+)*b
where a literal character follows. Before embarking on the standard matching
procedure, PCRE checks that there is a "b" later in the subject string, and if
there is not, it fails the match immediately. However, when there is no
following literal this optimization cannot be used. You can see the difference
by comparing the behaviour of
(a+)*\d
with the pattern above. The earlier pattern, (a+)*b, gives a failure almost
instantly when applied to a whole line of "a" characters, whereas (a+)*\d takes
an appreciable time with strings longer than about 20 characters.
Starting at release 3.3, PCRE has some support for character strings encoded
in the UTF-8 format. This is incomplete, and is regarded as experimental. In
order to use it, you must configure PCRE to include UTF-8 support in the code,
and, in addition, you must call pcre_compile() with the PCRE_UTF8 option
flag. When you do this, both the pattern and any subject strings that are
matched against it are treated as UTF-8 strings instead of just strings of
bytes, but only in the cases that are mentioned below.
If you compile PCRE with UTF-8 support, but do not use it at run time, the
library will be a bit bigger, but the additional run time overhead is limited
to testing the PCRE_UTF8 flag in several places, so should not be very large.
PCRE assumes that the strings it is given contain valid UTF-8 codes. It does
not diagnose invalid UTF-8 strings. If you pass invalid UTF-8 strings to PCRE,
the results are undefined.
Running with PCRE_UTF8 set causes these changes in the way PCRE works:
1. In a pattern, the escape sequence \x{...}, where the contents of the braces
is a string of hexadecimal digits, is interpreted as a UTF-8 character whose
code number is the given hexadecimal number, for example: \x{1234}. This
inserts from one to six literal bytes into the pattern, using the UTF-8
encoding. If a non-hexadecimal digit appears between the braces, the item is
not recognized.
2. The original hexadecimal escape sequence, \xhh, generates a two-byte UTF-8
character if its value is greater than 127.
3. Repeat quantifiers are NOT correctly handled if they follow a multibyte
character. For example, \x{100}* and \xc3+ do not work. If you want to
repeat such characters, you must enclose them in non-capturing parentheses,
for example (?:\x{100}), at present.
4. The dot metacharacter matches one UTF-8 character instead of a single byte.
5. Unlike literal UTF-8 characters, the dot metacharacter followed by a
repeat quantifier does operate correctly on UTF-8 characters instead of
single bytes.
6. Although the \x{...} escape is permitted in a character class, characters
whose values are greater than 255 cannot be included in a class.
7. A class is matched against a UTF-8 character instead of just a single byte,
but it can match only characters whose values are less than 256. Characters
with greater values always fail to match a class.
8. Repeated classes work correctly on multiple characters.
9. Classes containing just a single character whose value is greater than 127
(but less than 256), for example, [\x80] or [^\x{93}], do not work because
these are optimized into single byte matches. In the first case, of course,
the class brackets are just redundant.
10. Lookbehind assertions move backwards in the subject by a fixed number of
characters instead of a fixed number of bytes. Simple cases have been tested
to work correctly, but there may be hidden gotchas herein.
11. The character types such as \d and \w do not work correctly with UTF-8
characters. They continue to test a single byte.
12. Anything not explicitly mentioned here continues to work in bytes rather
than in characters.
The following UTF-8 features of Perl 5.6 are not implemented:
1. The escape sequence \C to match a single byte.
2. The use of Unicode tables and properties and escapes \p, \P, and \X.
Philip Hazel <ph10@cam.ac.uk>
University Computing Service,
New Museums Site,
Cambridge CB2 3QG, England.
Phone: +44 1223 334714
Last updated: 28 August 2000,
the 250th anniversary of the death of J.S. Bach.
Copyright (c) 1997-2000 University of Cambridge.
privoxy-3.0.21-stable/./pcre/doc/authors 000640 001751 001751 00000000263 10546014077 017075 0 ustar 00fk fk 000000 000000 Written by: Philip Hazel
University of Cambridge Computing Service,
Cambridge, England. Phone: +44 1223 334714.
Copyright (c) 1997-2000 University of Cambridge
privoxy-3.0.21-stable/./pcre/doc/readme 000640 001751 001751 00000030357 10546014100 016637 0 ustar 00fk fk 000000 000000 README file for PCRE (Perl-compatible regular expression library)
-----------------------------------------------------------------
The latest release of PCRE is always available from
ftp://ftp.csx.cam.ac.uk/pub/software/programming/pcre/pcre-xxx.tar.gz
Please read the NEWS file if you are upgrading from a previous release.
PCRE has its own native API, but a set of "wrapper" functions that are based on
the POSIX API are also supplied in the library libpcreposix. Note that this
just provides a POSIX calling interface to PCRE: the regular expressions
themselves still follow Perl syntax and semantics. The header file
for the POSIX-style functions is called pcreposix.h. The official POSIX name is
regex.h, but I didn't want to risk possible problems with existing files of
that name by distributing it that way. To use it with an existing program that
uses the POSIX API, it will have to be renamed or pointed at by a link.
Building PCRE on a Unix system
------------------------------
To build PCRE on a Unix system, run the "configure" command in the PCRE
distribution directory. This is a standard GNU "autoconf" configuration script,
for which generic instructions are supplied in INSTALL. On many systems just
running "./configure" is sufficient, but the usual methods of changing standard
defaults are available. For example,
CFLAGS='-O2 -Wall' ./configure --prefix=/opt/local
specifies that the C compiler should be run with the flags '-O2 -Wall' instead
of the default, and that "make install" should install PCRE under /opt/local
instead of the default /usr/local.
If you want to make use of the experimental, incomplete support for UTF-8
character strings in PCRE, you must add --enable-utf8 to the "configure"
command. Without it, the code for handling UTF-8 is not included in the
library. (Even when included, it still has to be enabled by an option at run
time.)
The "configure" script builds four files:
. Makefile is built by copying Makefile.in and making substitutions.
. config.h is built by copying config.in and making substitutions.
. pcre-config is built by copying pcre-config.in and making substitutions.
. RunTest is a script for running tests
Once "configure" has run, you can run "make". It builds two libraries called
libpcre and libpcreposix, a test program called pcretest, and the pcregrep
command. You can use "make install" to copy these, and the public header file
pcre.h, to appropriate live directories on your system, in the normal way.
Running "make install" also installs the command pcre-config, which can be used
to recall information about the PCRE configuration and installation. For
example,
pcre-config --version
prints the version number, and
pcre-config --libs
outputs information about where the library is installed. This command can be
included in makefiles for programs that use PCRE, saving the programmer from
having to remember too many details.
Shared libraries on Unix systems
--------------------------------
The default distribution builds PCRE as two shared libraries. This support is
new and experimental and may not work on all systems. It relies on the
"libtool" scripts - these are distributed with PCRE. It should build a
"libtool" script and use this to compile and link shared libraries, which are
placed in a subdirectory called .libs. The programs pcretest and pcregrep are
built to use these uninstalled libraries by means of wrapper scripts. When you
use "make install" to install shared libraries, pcregrep and pcretest are
automatically re-built to use the newly installed libraries. However, only
pcregrep is installed, as pcretest is really just a test program.
To build PCRE using static libraries you must use --disable-shared when
configuring it. For example
./configure --prefix=/usr/gnu --disable-shared
Then run "make" in the usual way.
Building on non-Unix systems
----------------------------
For a non-Unix system, read the comments in the file NON-UNIX-USE. PCRE has
been compiled on Windows systems and on Macintoshes, but I don't know the
details because I don't use those systems. It should be straightforward to
build PCRE on any system that has a Standard C compiler, because it uses only
Standard C functions.
Testing PCRE
------------
To test PCRE on a Unix system, run the RunTest script in the pcre directory.
(This can also be run by "make runtest", "make check", or "make test".) For
other systems, see the instructions in NON-UNIX-USE.
The script runs the pcretest test program (which is documented in
doc/pcretest.txt) on each of the testinput files (in the testdata directory) in
turn, and compares the output with the contents of the corresponding testoutput
file. A file called testtry is used to hold the output from pcretest. To run
pcretest on just one of the test files, give its number as an argument to
RunTest, for example:
RunTest 3
The first and third test files can also be fed directly into the perltest
script to check that Perl gives the same results. The third file requires the
additional features of release 5.005, which is why it is kept separate from the
main test input, which needs only Perl 5.004. In the long run, when 5.005 (or
higher) is widespread, these two test files may get amalgamated.
The second set of tests checks pcre_fullinfo(), pcre_info(), pcre_study(),
pcre_copy_substring(), pcre_get_substring(), pcre_get_substring_list(), error
detection, and run-time flags that are specific to PCRE, as well as the POSIX
wrapper API. It also uses the debugging flag to check some of the internals of
pcre_compile().
If you build PCRE with a locale setting that is not the standard C locale, the
character tables may be different (see next paragraph). In some cases, this may
cause failures in the second set of tests. For example, in a locale where the
isprint() function yields TRUE for characters in the range 128-255, the use of
[:isascii:] inside a character class defines a different set of characters, and
this shows up in this test as a difference in the compiled code, which is being
listed for checking. Where the comparison test output contains [\x00-\x7f] the
test will contain [\x00-\xff], and similarly in some other cases. This is not a
bug in PCRE.
The fourth set of tests checks pcre_maketables(), the facility for building a
set of character tables for a specific locale and using them instead of the
default tables. The tests make use of the "fr" (French) locale. Before running
the test, the script checks for the presence of this locale by running the
"locale" command. If that command fails, or if it doesn't include "fr" in the
list of available locales, the fourth test cannot be run, and a comment is
output to say why. If running this test produces instances of the error
** Failed to set locale "fr"
in the comparison output, it means that locale is not available on your system,
despite being listed by "locale". This does not mean that PCRE is broken.
The fifth test checks the experimental, incomplete UTF-8 support. It is not run
automatically unless PCRE is built with UTF-8 support. This file can be fed
directly to the perltest8 script, which requires Perl 5.6 or higher. The sixth
file tests internal UTF-8 features of PCRE that are not relevant to Perl.
Character tables
----------------
PCRE uses four tables for manipulating and identifying characters. The final
argument of the pcre_compile() function is a pointer to a block of memory
containing the concatenated tables. A call to pcre_maketables() can be used to
generate a set of tables in the current locale. If the final argument for
pcre_compile() is passed as NULL, a set of default tables that is built into
the binary is used.
The source file called chartables.c contains the default set of tables. This is
not supplied in the distribution, but is built by the program dftables
(compiled from dftables.c), which uses the ANSI C character handling functions
such as isalnum(), isalpha(), isupper(), islower(), etc. to build the table
sources. This means that the default C locale which is set for your system will
control the contents of these default tables. You can change the default tables
by editing chartables.c and then re-building PCRE. If you do this, you should
probably also edit Makefile to ensure that the file doesn't ever get
re-generated.
The first two 256-byte tables provide lower casing and case flipping functions,
respectively. The next table consists of three 32-byte bit maps which identify
digits, "word" characters, and white space, respectively. These are used when
building 32-byte bit maps that represent character classes.
The final 256-byte table has bits indicating various character types, as
follows:
1 white space character
2 letter
4 decimal digit
8 hexadecimal digit
16 alphanumeric or '_'
128 regular expression metacharacter or binary zero
You should not alter the set of characters that contain the 128 bit, as that
will cause PCRE to malfunction.
Manifest
--------
The distribution should contain the following files:
(A) The actual source files of the PCRE library functions and their
headers:
dftables.c auxiliary program for building chartables.c
get.c )
maketables.c )
study.c ) source of
pcre.c ) the functions
pcreposix.c )
pcre.in "source" for the header for the external API; pcre.h
is built from this by "configure"
pcreposix.h header for the external POSIX wrapper API
internal.h header for internal use
config.in template for config.h, which is built by configure
(B) Auxiliary files:
AUTHORS information about the author of PCRE
ChangeLog log of changes to the code
INSTALL generic installation instructions
LICENCE conditions for the use of PCRE
COPYING the same, using GNU's standard name
Makefile.in template for Unix Makefile, which is built by configure
NEWS important changes in this release
NON-UNIX-USE notes on building PCRE on non-Unix systems
README this file
RunTest.in template for a Unix shell script for running tests
config.guess ) files used by libtool,
config.sub ) used only when building a shared library
configure a configuring shell script (built by autoconf)
configure.in the autoconf input used to build configure
doc/Tech.Notes notes on the encoding
doc/pcre.3 man page source for the PCRE functions
doc/pcre.html HTML version
doc/pcre.txt plain text version
doc/pcreposix.3 man page source for the POSIX wrapper API
doc/pcreposix.html HTML version
doc/pcreposix.txt plain text version
doc/pcretest.txt documentation of test program
doc/perltest.txt documentation of Perl test program
doc/pcregrep.1 man page source for the pcregrep utility
doc/pcregrep.html HTML version
doc/pcregrep.txt plain text version
install-sh a shell script for installing files
ltconfig ) files used to build "libtool",
ltmain.sh ) used only when building a shared library
pcretest.c test program
perltest Perl test program
perltest8 Perl test program for UTF-8 tests
pcregrep.c source of a grep utility that uses PCRE
pcre-config.in source of script which retains PCRE information
testdata/testinput1 test data, compatible with Perl 5.004 and 5.005
testdata/testinput2 test data for error messages and non-Perl things
testdata/testinput3 test data, compatible with Perl 5.005
testdata/testinput4 test data for locale-specific tests
testdata/testinput5 test data for UTF-8 tests compatible with Perl 5.6
testdata/testinput6 test data for other UTF-8 tests
testdata/testoutput1 test results corresponding to testinput1
testdata/testoutput2 test results corresponding to testinput2
testdata/testoutput3 test results corresponding to testinput3
testdata/testoutput4 test results corresponding to testinput4
testdata/testoutput5 test results corresponding to testinput5
testdata/testoutput6 test results corresponding to testinput6
(C) Auxiliary files for Win32 DLL
dll.mk
pcre.def
Philip Hazel
August 2000
privoxy-3.0.21-stable/./pcre/doc/copying 000640 001751 001751 00000003361 10546014077 017062 0 ustar 00fk fk 000000 000000 PCRE LICENCE
------------
PCRE is a library of functions to support regular expressions whose syntax
and semantics are as close as possible to those of the Perl 5 language.
Written by: Philip Hazel
University of Cambridge Computing Service,
Cambridge, England. Phone: +44 1223 334714.
Copyright (c) 1997-2000 University of Cambridge
Permission is granted to anyone to use this software for any purpose on any
computer system, and to redistribute it freely, subject to the following
restrictions:
1. This software is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
2. The origin of this software must not be misrepresented, either by
explicit claim or by omission. In practice, this means that if you use
PCRE in software which you distribute to others, commercially or
otherwise, you must put a sentence like this
Regular expression support is provided by the PCRE library package,
which is open source software, written by Philip Hazel, and copyright
by the University of Cambridge, England.
somewhere reasonably visible in your documentation and in any relevant
files or online help data or similar. A reference to the ftp site for
the source, that is, to
ftp://ftp.csx.cam.ac.uk/pub/software/programming/pcre/
should also be given in the documentation.
3. Altered versions must be plainly marked as such, and must not be
misrepresented as being the original software.
4. If PCRE is embedded in any software that is released under the GNU
General Purpose Licence (GPL), then the terms of that licence shall
supersede any condition above with which it is incompatible.
End
privoxy-3.0.21-stable/./pcre/doc/pcreposix.txt 000640 001751 001751 00000014026 10546014100 020227 0 ustar 00fk fk 000000 000000 NAME
pcreposix - POSIX API for Perl-compatible regular expres-
sions.
SYNOPSIS
#include <pcreposix.h>
int regcomp(regex_t *preg, const char *pattern,
int cflags);
int regexec(regex_t *preg, const char *string,
size_t nmatch, regmatch_t pmatch[], int eflags);
size_t regerror(int errcode, const regex_t *preg,
char *errbuf, size_t errbuf_size);
void regfree(regex_t *preg);
DESCRIPTION
This set of functions provides a POSIX-style API to the PCRE
regular expression package. See the pcre documentation for a
description of the native API, which contains additional
functionality.
The functions described here are just wrapper functions that
ultimately call the native API. Their prototypes are defined
in the pcreposix.h header file, and on Unix systems the
library itself is called pcreposix.a, so can be accessed by
adding -lpcreposix to the command for linking an application
which uses them. Because the POSIX functions call the native
ones, it is also necessary to add -lpcre.
I have implemented only those option bits that can be rea-
sonably mapped to PCRE native options. In addition, the
options REG_EXTENDED and REG_NOSUB are defined with the
value zero. They have no effect, but since programs that are
written to the POSIX interface often use them, this makes it
easier to slot in PCRE as a replacement library. Other POSIX
options are not even defined.
When PCRE is called via these functions, it is only the API
that is POSIX-like in style. The syntax and semantics of the
regular expressions themselves are still those of Perl, sub-
ject to the setting of various PCRE options, as described
below.
The header for these functions is supplied as pcreposix.h to
avoid any potential clash with other POSIX libraries. It
can, of course, be renamed or aliased as regex.h, which is
the "correct" name. It provides two structure types, regex_t
for compiled internal forms, and regmatch_t for returning
captured substrings. It also defines some constants whose
names start with "REG_"; these are used for setting options
and identifying error codes.
COMPILING A PATTERN
The function regcomp() is called to compile a pattern into
an internal form. The pattern is a C string terminated by a
binary zero, and is passed in the argument pattern. The preg
argument is a pointer to a regex_t structure which is used
as a base for storing information about the compiled expres-
sion.
The argument cflags is either zero, or contains one or more
of the bits defined by the following macros:
REG_ICASE
The PCRE_CASELESS option is set when the expression is
passed for compilation to the native function.
REG_NEWLINE
The PCRE_MULTILINE option is set when the expression is
passed for compilation to the native function.
In the absence of these flags, no options are passed to the
native function. This means that the regex is compiled with
PCRE default semantics. In particular, the way it handles
newline characters in the subject string is the Perl way,
not the POSIX way. Note that setting PCRE_MULTILINE has only
some of the effects specified for REG_NEWLINE. It does not
affect the way newlines are matched by . (they aren't) or a
negative class such as [^a] (they are).
The yield of regcomp() is zero on success, and non-zero oth-
erwise. The preg structure is filled in on success, and one
member of the structure is publicized: re_nsub contains the
number of capturing subpatterns in the regular expression.
Various error codes are defined in the header file.
MATCHING A PATTERN
The function regexec() is called to match a pre-compiled
pattern preg against a given string, which is terminated by
a zero byte, subject to the options in eflags. These can be:
REG_NOTBOL
The PCRE_NOTBOL option is set when calling the underlying
PCRE matching function.
REG_NOTEOL
The PCRE_NOTEOL option is set when calling the underlying
PCRE matching function.
The portion of the string that was matched, and also any
captured substrings, are returned via the pmatch argument,
which points to an array of nmatch structures of type
regmatch_t, containing the members rm_so and rm_eo. These
contain the offset to the first character of each substring
and the offset to the first character after the end of each
substring, respectively. The 0th element of the vector
relates to the entire portion of string that was matched;
subsequent elements relate to the capturing subpatterns of
the regular expression. Unused entries in the array have
both structure members set to -1.
A successful match yields a zero return; various error codes
are defined in the header file, of which REG_NOMATCH is the
"expected" failure code.
ERROR MESSAGES
The regerror() function maps a non-zero errorcode from
either regcomp or regexec to a printable message. If preg is
not NULL, the error should have arisen from the use of that
structure. A message terminated by a binary zero is placed
in errbuf. The length of the message, including the zero, is
limited to errbuf_size. The yield of the function is the
size of buffer needed to hold the whole message.
STORAGE
Compiling a regular expression causes memory to be allocated
and associated with the preg structure. The function reg-
free() frees all such memory, after which preg may no longer
be used as a compiled expression.
AUTHOR
Philip Hazel
University Computing Service,
New Museums Site,
Cambridge CB2 3QG, England.
Phone: +44 1223 334714
Copyright (c) 1997-2000 University of Cambridge.
privoxy-3.0.21-stable/./pcre/doc/pcretest.txt 000640 001751 001751 00000024456 10546014100 020054 0 ustar 00fk fk 000000 000000 The pcretest program
--------------------
This program is intended for testing PCRE, but it can also be used for
experimenting with regular expressions.
If it is given two filename arguments, it reads from the first and writes to
the second. If it is given only one filename argument, it reads from that file
and writes to stdout. Otherwise, it reads from stdin and writes to stdout, and
prompts for each line of input, using "re>" to prompt for regular expressions,
and "data>" to prompt for data lines.
The program handles any number of sets of input on a single input file. Each
set starts with a regular expression, and continues with any number of data
lines to be matched against the pattern. An empty line signals the end of the
data lines, at which point a new regular expression is read. The regular
expressions are given enclosed in any non-alphameric delimiters other than
backslash, for example
/(a|bc)x+yz/
White space before the initial delimiter is ignored. A regular expression may
be continued over several input lines, in which case the newline characters are
included within it. See the test input files in the testdata directory for many
examples. It is possible to include the delimiter within the pattern by
escaping it, for example
/abc\/def/
If you do so, the escape and the delimiter form part of the pattern, but since
delimiters are always non-alphameric, this does not affect its interpretation.
If the terminating delimiter is immediately followed by a backslash, for
example,
/abc/\
then a backslash is added to the end of the pattern. This is done to provide a
way of testing the error condition that arises if a pattern finishes with a
backslash, because
/abc\/
is interpreted as the first line of a pattern that starts with "abc/", causing
pcretest to read the next line as a continuation of the regular expression.
PATTERN MODIFIERS
-----------------
The pattern may be followed by i, m, s, or x to set the PCRE_CASELESS,
PCRE_MULTILINE, PCRE_DOTALL, or PCRE_EXTENDED options, respectively. For
example:
/caseless/i
These modifier letters have the same effect as they do in Perl. There are
others which set PCRE options that do not correspond to anything in Perl: /A,
/E, and /X set PCRE_ANCHORED, PCRE_DOLLAR_ENDONLY, and PCRE_EXTRA respectively.
Searching for all possible matches within each subject string can be requested
by the /g or /G modifier. After finding a match, PCRE is called again to search
the remainder of the subject string. The difference between /g and /G is that
the former uses the startoffset argument to pcre_exec() to start searching at
a new point within the entire string (which is in effect what Perl does),
whereas the latter passes over a shortened substring. This makes a difference
to the matching process if the pattern begins with a lookbehind assertion
(including \b or \B).
If any call to pcre_exec() in a /g or /G sequence matches an empty string, the
next call is done with the PCRE_NOTEMPTY and PCRE_ANCHORED flags set in order
to search for another, non-empty, match at the same point. If this second match
fails, the start offset is advanced by one, and the normal match is retried.
This imitates the way Perl handles such cases when using the /g modifier or the
split() function.
There are a number of other modifiers for controlling the way pcretest
operates.
The /+ modifier requests that as well as outputting the substring that matched
the entire pattern, pcretest should in addition output the remainder of the
subject string. This is useful for tests where the subject contains multiple
copies of the same substring.
The /L modifier must be followed directly by the name of a locale, for example,
/pattern/Lfr
For this reason, it must be the last modifier letter. The given locale is set,
pcre_maketables() is called to build a set of character tables for the locale,
and this is then passed to pcre_compile() when compiling the regular
expression. Without an /L modifier, NULL is passed as the tables pointer; that
is, /L applies only to the expression on which it appears.
The /I modifier requests that pcretest output information about the compiled
expression (whether it is anchored, has a fixed first character, and so on). It
does this by calling pcre_fullinfo() after compiling an expression, and
outputting the information it gets back. If the pattern is studied, the results
of that are also output.
The /D modifier is a PCRE debugging feature, which also assumes /I. It causes
the internal form of compiled regular expressions to be output after
compilation.
The /S modifier causes pcre_study() to be called after the expression has been
compiled, and the results used when the expression is matched.
The /M modifier causes the size of memory block used to hold the compiled
pattern to be output.
The /P modifier causes pcretest to call PCRE via the POSIX wrapper API rather
than its native API. When this is done, all other modifiers except /i, /m, and
/+ are ignored. REG_ICASE is set if /i is present, and REG_NEWLINE is set if /m
is present. The wrapper functions force PCRE_DOLLAR_ENDONLY always, and
PCRE_DOTALL unless REG_NEWLINE is set.
The /8 modifier causes pcretest to call PCRE with the PCRE_UTF8 option set.
This turns on the (currently incomplete) support for UTF-8 character handling
in PCRE, provided that it was compiled with this support enabled. This modifier
also causes any non-printing characters in output strings to be printed using
the \x{hh...} notation if they are valid UTF-8 sequences.
DATA LINES
----------
Before each data line is passed to pcre_exec(), leading and trailing whitespace
is removed, and it is then scanned for \ escapes. The following are recognized:
\a alarm (= BEL)
\b backspace
\e escape
\f formfeed
\n newline
\r carriage return
\t tab
\v vertical tab
\nnn octal character (up to 3 octal digits)
\xhh hexadecimal character (up to 2 hex digits)
\x{hh...} hexadecimal UTF-8 character
\A pass the PCRE_ANCHORED option to pcre_exec()
\B pass the PCRE_NOTBOL option to pcre_exec()
\Cdd call pcre_copy_substring() for substring dd after a successful
match (any decimal number less than 32)
\Gdd call pcre_get_substring() for substring dd after a successful
match (any decimal number less than 32)
\L call pcre_get_substring_list() after a successful match
\N pass the PCRE_NOTEMPTY option to pcre_exec()
\Odd set the size of the output vector passed to pcre_exec() to dd
(any number of decimal digits)
\Z pass the PCRE_NOTEOL option to pcre_exec()
A backslash followed by anything else just escapes the anything else. If the
very last character is a backslash, it is ignored. This gives a way of passing
an empty line as data, since a real empty line terminates the data input.
If /P was present on the regex, causing the POSIX wrapper API to be used, only
\B and \Z have any effect, causing REG_NOTBOL and REG_NOTEOL to be passed to
regexec() respectively.
The use of \x{hh...} to represent UTF-8 characters is not dependent on the use
of the /8 modifier on the pattern. It is recognized always. There may be any
number of hexadecimal digits inside the braces. The result is from one to six
bytes, encoded according to the UTF-8 rules.
OUTPUT FROM PCRETEST
--------------------
When a match succeeds, pcretest outputs the list of captured substrings that
pcre_exec() returns, starting with number 0 for the string that matched the
whole pattern. Here is an example of an interactive pcretest run.
$ pcretest
PCRE version 2.06 08-Jun-1999
re> /^abc(\d+)/
data> abc123
0: abc123
1: 123
data> xyz
No match
If the strings contain any non-printing characters, they are output as \0x
escapes, or as \x{...} escapes if the /8 modifier was present on the pattern.
If the pattern has the /+ modifier, then the output for substring 0 is followed
by the rest of the subject string, identified by "0+" like this:
re> /cat/+
data> cataract
0: cat
0+ aract
If the pattern has the /g or /G modifier, the results of successive matching
attempts are output in sequence, like this:
re> /\Bi(\w\w)/g
data> Mississippi
0: iss
1: ss
0: iss
1: ss
0: ipp
1: pp
"No match" is output only if the first match attempt fails.
If any of \C, \G, or \L are present in a data line that is successfully
matched, the substrings extracted by the convenience functions are output with
C, G, or L after the string number instead of a colon. This is in addition to
the normal full list. The string length (that is, the return from the
extraction function) is given in parentheses after each string for \C and \G.
Note that while patterns can be continued over several lines (a plain ">"
prompt is used for continuations), data lines may not. However newlines can be
included in data by means of the \n escape.
COMMAND LINE OPTIONS
--------------------
If the -p option is given to pcretest, it is equivalent to adding /P to each
regular expression: the POSIX wrapper API is used to call PCRE. None of the
following flags has any effect in this case.
If the option -d is given to pcretest, it is equivalent to adding /D to each
regular expression: the internal form is output after compilation.
If the option -i is given to pcretest, it is equivalent to adding /I to each
regular expression: information about the compiled pattern is given after
compilation.
If the option -m is given to pcretest, it outputs the size of each compiled
pattern after it has been compiled. It is equivalent to adding /M to each
regular expression. For compatibility with earlier versions of pcretest, -s is
a synonym for -m.
If the -t option is given, each compile, study, and match is run 20000 times
while being timed, and the resulting time per compile or match is output in
milliseconds. Do not set -t with -m, because you will then get the size output
20000 times and the timing will be distorted. If you want to change the number
of repetitions used for timing, edit the definition of LOOPREPEAT at the top of
pcretest.c
Philip Hazel
August 2000
privoxy-3.0.21-stable/./pcre/doc/pcregrep.1 000640 001751 001751 00000005022 10546014100 017337 0 ustar 00fk fk 000000 000000 .TH PCREGREP 1
.SH NAME
pcregrep - a grep with Perl-compatible regular expressions.
.SH SYNOPSIS
.B pcregrep [-Vchilnsvx] pattern [file] ...
.SH DESCRIPTION
\fBpcregrep\fR searches files for character patterns, in the same way as other
grep commands do, but it uses the PCRE regular expression library to support
patterns that are compatible with the regular expressions of Perl 5. See
\fBpcre(3)\fR for a full description of syntax and semantics.
If no files are specified, \fBpcregrep\fR reads the standard input. By default,
each line that matches the pattern is copied to the standard output, and if
there is more than one file, the file name is printed before each line of
output. However, there are options that can change how \fBpcregrep\fR behaves.
Lines are limited to BUFSIZ characters. BUFSIZ is defined in \fB<stdio.h>\fR.
The newline character is removed from the end of each line before it is matched
against the pattern.
.SH OPTIONS
.TP 10
\fB-V\fR
Write the version number of the PCRE library being used to the standard error
stream.
.TP
\fB-c\fR
Do not print individual lines; instead just print a count of the number of
lines that would otherwise have been printed. If several files are given, a
count is printed for each of them.
.TP
\fB-h\fR
Suppress printing of filenames when searching multiple files.
.TP
\fB-i\fR
Ignore upper/lower case distinctions during comparisons.
.TP
\fB-l\fR
Instead of printing lines from the files, just print the names of the files
containing lines that would have been printed. Each file name is printed
once, on a separate line.
.TP
\fB-n\fR
Precede each line by its line number in the file.
.TP
\fB-s\fR
Work silently, that is, display nothing except error messages.
The exit status indicates whether any matches were found.
.TP
\fB-v\fR
Invert the sense of the match, so that lines which do \fInot\fR match the
pattern are now the ones that are found.
.TP
\fB-x\fR
Force the pattern to be anchored (it must start matching at the beginning of
the line) and in addition, require it to match the entire line. This is
equivalent to having ^ and $ characters at the start and end of each
alternative branch in the regular expression.
.SH SEE ALSO
\fBpcre(3)\fR, Perl 5 documentation
.SH DIAGNOSTICS
Exit status is 0 if any matches were found, 1 if no matches were found, and 2
for syntax errors or inaccessible files (even if matches were found).
.SH AUTHOR
Philip Hazel
.br
Copyright (c) 1997-2000 University of Cambridge.
privoxy-3.0.21-stable/./pcre/doc/Tech.Notes 000640 001751 001751 00000022731 10546014077 017366 0 ustar 00fk fk 000000 000000 Technical Notes about PCRE
--------------------------
Many years ago I implemented some regular expression functions to an algorithm
suggested by Martin Richards. These were not Unix-like in form, and were quite
restricted in what they could do by comparison with Perl. The interesting part
about the algorithm was that the amount of space required to hold the compiled
form of an expression was known in advance. The code to apply an expression did
not operate by backtracking, as the Henry Spencer and Perl code does, but
instead checked all possibilities simultaneously by keeping a list of current
states and checking all of them as it advanced through the subject string. (In
the terminology of Jeffrey Friedl's book, it was a "DFA algorithm".) When the
pattern was all used up, all remaining states were possible matches, and the
one matching the longest subset of the subject string was chosen. This did not
necessarily maximize the individual wild portions of the pattern, as is
expected in Unix and Perl-style regular expressions.
By contrast, the code originally written by Henry Spencer and subsequently
heavily modified for Perl actually compiles the expression twice: once in a
dummy mode in order to find out how much store will be needed, and then for
real. The execution function operates by backtracking and maximizing (or,
optionally, minimizing in Perl) the amount of the subject that matches
individual wild portions of the pattern. This is an "NFA algorithm" in Friedl's
terminology.
For the set of functions that forms PCRE (which are unrelated to those
mentioned above), I tried at first to invent an algorithm that used an amount
of store bounded by a multiple of the number of characters in the pattern, to
save on compiling time. However, because of the greater complexity in Perl
regular expressions, I couldn't do this. In any case, a first pass through the
pattern is needed, in order to find internal flag settings like (?i) at top
level. So PCRE works by running a very degenerate first pass to calculate a
maximum store size, and then a second pass to do the real compile - which may
use a bit less than the predicted amount of store. The idea is that this is
going to turn out faster because the first pass is degenerate and the second
pass can just store stuff straight into the vector. It does make the compiling
functions bigger, of course, but they have got quite big anyway to handle all
the Perl stuff.
The compiled form of a pattern is a vector of bytes, containing items of
variable length. The first byte in an item is an opcode, and the length of the
item is either implicit in the opcode or contained in the data bytes which
follow it. A list of all the opcodes follows:
Opcodes with no following data
------------------------------
These items are all just one byte long
OP_END end of pattern
OP_ANY match any character
OP_SOD match start of data: \A
OP_CIRC ^ (start of data, or after \n in multiline)
OP_NOT_WORD_BOUNDARY \B
OP_WORD_BOUNDARY \b
OP_NOT_DIGIT \D
OP_DIGIT \d
OP_NOT_WHITESPACE \S
OP_WHITESPACE \s
OP_NOT_WORDCHAR \W
OP_WORDCHAR \w
OP_EODN match end of data or \n at end: \Z
OP_EOD match end of data: \z
OP_DOLL $ (end of data, or before \n in multiline)
OP_RECURSE match the pattern recursively
Repeating single characters
---------------------------
The common repeats (*, +, ?) when applied to a single character appear as
two-byte items using the following opcodes:
OP_STAR
OP_MINSTAR
OP_PLUS
OP_MINPLUS
OP_QUERY
OP_MINQUERY
Those with "MIN" in their name are the minimizing versions. Each is followed by
the character that is to be repeated. Other repeats make use of
OP_UPTO
OP_MINUPTO
OP_EXACT
which are followed by a two-byte count (most significant first) and the
repeated character. OP_UPTO matches from 0 to the given number. A repeat with a
non-zero minimum and a fixed maximum is coded as an OP_EXACT followed by an
OP_UPTO (or OP_MINUPTO).
Repeating character types
-------------------------
Repeats of things like \d are done exactly as for single characters, except
that instead of a character, the opcode for the type is stored in the data
byte. The opcodes are:
OP_TYPESTAR
OP_TYPEMINSTAR
OP_TYPEPLUS
OP_TYPEMINPLUS
OP_TYPEQUERY
OP_TYPEMINQUERY
OP_TYPEUPTO
OP_TYPEMINUPTO
OP_TYPEEXACT
Matching a character string
---------------------------
The OP_CHARS opcode is followed by a one-byte count and then that number of
characters. If there are more than 255 characters in sequence, successive
instances of OP_CHARS are used.
Character classes
-----------------
OP_CLASS is used for a character class, provided there are at least two
characters in the class. If there is only one character, OP_CHARS is used for a
positive class, and OP_NOT for a negative one (that is, for something like
[^a]). Another set of repeating opcodes (OP_NOTSTAR etc.) are used for a
repeated, negated, single-character class. The normal ones (OP_STAR etc.) are
used for a repeated positive single-character class.
OP_CLASS is followed by a 32-byte bit map containing a 1 bit for every
character that is acceptable. The bits are counted from the least significant
end of each byte.
Back references
---------------
OP_REF is followed by a single byte containing the reference number.
Repeating character classes and back references
-----------------------------------------------
Single-character classes are handled specially (see above). This applies to
OP_CLASS and OP_REF. In both cases, the repeat information follows the base
item. The matching code looks at the following opcode to see if it is one of
OP_CRSTAR
OP_CRMINSTAR
OP_CRPLUS
OP_CRMINPLUS
OP_CRQUERY
OP_CRMINQUERY
OP_CRRANGE
OP_CRMINRANGE
All but the last two are just single-byte items. The others are followed by
four bytes of data, comprising the minimum and maximum repeat counts.
Brackets and alternation
------------------------
A pair of non-capturing (round) brackets is wrapped round each expression at
compile time, so alternation always happens in the context of brackets.
Non-capturing brackets use the opcode OP_BRA, while capturing brackets use
OP_BRA+1, OP_BRA+2, etc. [Note for North Americans: "bracket" to some English
speakers, including myself, can be round, square, curly, or pointy. Hence this
usage.]
A bracket opcode is followed by two bytes which give the offset to the next
alternative OP_ALT or, if there aren't any branches, to the matching KET
opcode. Each OP_ALT is followed by two bytes giving the offset to the next one,
or to the KET opcode.
OP_KET is used for subpatterns that do not repeat indefinitely, while
OP_KETRMIN and OP_KETRMAX are used for indefinite repetitions, minimally or
maximally respectively. All three are followed by two bytes giving (as a
positive number) the offset back to the matching BRA opcode.
If a subpattern is quantified such that it is permitted to match zero times, it
is preceded by one of OP_BRAZERO or OP_BRAMINZERO. These are single-byte
opcodes which tell the matcher that skipping this subpattern entirely is a
valid branch.
A subpattern with an indefinite maximum repetition is replicated in the
compiled data its minimum number of times (or once with a BRAZERO if the
minimum is zero), with the final copy terminating with a KETRMIN or KETRMAX as
appropriate.
A subpattern with a bounded maximum repetition is replicated in a nested
fashion up to the maximum number of times, with BRAZERO or BRAMINZERO before
each replication after the minimum, so that, for example, (abc){2,5} is
compiled as (abc)(abc)((abc)((abc)(abc)?)?)?. The 200-bracket limit does not
apply to these internally generated brackets.
Assertions
----------
Forward assertions are just like other subpatterns, but starting with one of
the opcodes OP_ASSERT or OP_ASSERT_NOT. Backward assertions use the opcodes
OP_ASSERTBACK and OP_ASSERTBACK_NOT, and the first opcode inside the assertion
is OP_REVERSE, followed by a two byte count of the number of characters to move
back the pointer in the subject string. When operating in UTF-8 mode, the count
is a character count rather than a byte count. A separate count is present in
each alternative of a lookbehind assertion, allowing them to have different
fixed lengths.
Once-only subpatterns
---------------------
These are also just like other subpatterns, but they start with the opcode
OP_ONCE.
Conditional subpatterns
-----------------------
These are like other subpatterns, but they start with the opcode OP_COND. If
the condition is a back reference, this is stored at the start of the
subpattern using the opcode OP_CREF followed by one byte containing the
reference number. Otherwise, a conditional subpattern will always start with
one of the assertions.
Changing options
----------------
If any of the /i, /m, or /s options are changed within a parenthesized group,
an OP_OPT opcode is compiled, followed by one byte containing the new settings
of these flags. If there are several alternatives in a group, there is an
occurrence of OP_OPT at the start of all those following the first options
change, to set appropriate options for the start of the alternative.
Immediately after the end of the group there is another such item to reset the
flags to their previous values. Other changes of flag within the pattern can be
handled entirely at compile time, and so do not cause anything to be put into
the compiled data.
Philip Hazel
August 2000
privoxy-3.0.21-stable/./pcre/doc/pcre.txt 000640 001751 001751 00000266117 10546014100 017156 0 ustar 00fk fk 000000 000000 NAME
pcre - Perl-compatible regular expressions.
SYNOPSIS
#include <pcre.h>
pcre *pcre_compile(const char *pattern, int options,
const char **errptr, int *erroffset,
const unsigned char *tableptr);
pcre_extra *pcre_study(const pcre *code, int options,
const char **errptr);
int pcre_exec(const pcre *code, const pcre_extra *extra,
const char *subject, int length, int startoffset,
int options, int *ovector, int ovecsize);
int pcre_copy_substring(const char *subject, int *ovector,
int stringcount, int stringnumber, char *buffer,
int buffersize);
int pcre_get_substring(const char *subject, int *ovector,
int stringcount, int stringnumber,
const char **stringptr);
int pcre_get_substring_list(const char *subject,
int *ovector, int stringcount, const char ***listptr);
void pcre_free_substring(const char *stringptr);
void pcre_free_substring_list(const char **stringptr);
const unsigned char *pcre_maketables(void);
int pcre_fullinfo(const pcre *code, const pcre_extra *extra,
int what, void *where);
int pcre_info(const pcre *code, int *optptr, int *firstcharptr);
char *pcre_version(void);
void *(*pcre_malloc)(size_t);
void (*pcre_free)(void *);
DESCRIPTION
The PCRE library is a set of functions that implement regu-
lar expression pattern matching using the same syntax and
semantics as Perl 5, with just a few differences (see
below). The current implementation corresponds to Perl
5.005, with some additional features from later versions.
This includes some experimental, incomplete support for
UTF-8 encoded strings. Details of exactly what is and what
is not supported are given below.
PCRE has its own native API, which is described in this
document. There is also a set of wrapper functions that
correspond to the POSIX regular expression API. These are
described in the pcreposix documentation.
The native API function prototypes are defined in the header
file pcre.h, and on Unix systems the library itself is
called libpcre.a, so can be accessed by adding -lpcre to the
command for linking an application which calls it. The
header file defines the macros PCRE_MAJOR and PCRE_MINOR to
contain the major and minor release numbers for the library.
Applications can use these to include support for different
releases.
The functions pcre_compile(), pcre_study(), and pcre_exec()
are used for compiling and matching regular expressions.
The functions pcre_copy_substring(), pcre_get_substring(),
and pcre_get_substring_list() are convenience functions for
extracting captured substrings from a matched subject
string; pcre_free_substring() and pcre_free_substring_list()
are also provided, to free the memory used for extracted
strings.
The function pcre_maketables() is used (optionally) to build
a set of character tables in the current locale for passing
to pcre_compile().
The function pcre_fullinfo() is used to find out information
about a compiled pattern; pcre_info() is an obsolete version
which returns only some of the available information, but is
retained for backwards compatibility. The function
pcre_version() returns a pointer to a string containing the
version of PCRE and its date of release.
The global variables pcre_malloc and pcre_free initially
contain the entry points of the standard malloc() and free()
functions respectively. PCRE calls the memory management
functions via these variables, so a calling program can
replace them if it wishes to intercept the calls. This
should be done before calling any PCRE functions.
MULTI-THREADING
The PCRE functions can be used in multi-threading
applications, with the proviso that the memory management
functions pointed to by pcre_malloc and pcre_free are shared
by all threads.
The compiled form of a regular expression is not altered
during matching, so the same compiled pattern can safely be
used by several threads at once.
COMPILING A PATTERN
The function pcre_compile() is called to compile a pattern
into an internal form. The pattern is a C string terminated
by a binary zero, and is passed in the argument pattern. A
pointer to a single block of memory that is obtained via
pcre_malloc is returned. This contains the compiled code and
related data. The pcre type is defined for this for conveni-
ence, but in fact pcre is just a typedef for void, since the
contents of the block are not externally defined. It is up
to the caller to free the memory when it is no longer
required.
The size of a compiled pattern is roughly proportional to
the length of the pattern string, except that each character
class (other than those containing just a single character,
negated or not) requires 33 bytes, and repeat quantifiers
with a minimum greater than one or a bounded maximum cause
the relevant portions of the compiled pattern to be repli-
cated.
The options argument contains independent bits that affect
the compilation. It should be zero if no options are
required. Some of the options, in particular, those that are
compatible with Perl, can also be set and unset from within
the pattern (see the detailed description of regular expres-
sions below). For these options, the contents of the options
argument specify their initial settings at the start of
compilation and execution. The PCRE_ANCHORED option can be
set at the time of matching as well as at compile time.
If errptr is NULL, pcre_compile() returns NULL immediately.
Otherwise, if compilation of a pattern fails, pcre_compile()
returns NULL, and sets the variable pointed to by errptr to
point to a textual error message. The offset from the start
of the pattern to the character where the error was
discovered is placed in the variable pointed to by
erroffset, which must not be NULL. If it is, an immediate
error is given.
If the final argument, tableptr, is NULL, PCRE uses a
default set of character tables which are built when it is
compiled, using the default C locale. Otherwise, tableptr
must be the result of a call to pcre_maketables(). See the
section on locale support below.
The following option bits are defined in the header file:
PCRE_ANCHORED
If this bit is set, the pattern is forced to be "anchored",
that is, it is constrained to match only at the start of the
string which is being searched (the "subject string"). This
effect can also be achieved by appropriate constructs in the
pattern itself, which is the only way to do it in Perl.
PCRE_CASELESS
If this bit is set, letters in the pattern match both upper
and lower case letters. It is equivalent to Perl's /i
option.
PCRE_DOLLAR_ENDONLY
If this bit is set, a dollar metacharacter in the pattern
matches only at the end of the subject string. Without this
option, a dollar also matches immediately before the final
character if it is a newline (but not before any other new-
lines). The PCRE_DOLLAR_ENDONLY option is ignored if
PCRE_MULTILINE is set. There is no equivalent to this option
in Perl.
PCRE_DOTALL
If this bit is set, a dot metacharacter in the pattern
matches all characters, including newlines. Without it, new-
lines are excluded. This option is equivalent to Perl's /s
option. A negative class such as [^a] always matches a new-
line character, independent of the setting of this option.
PCRE_EXTENDED
If this bit is set, whitespace data characters in the pat-
tern are totally ignored except when escaped or inside a
character class, and characters between an unescaped # out-
side a character class and the next newline character,
inclusive, are also ignored. This is equivalent to Perl's /x
option, and makes it possible to include comments inside
complicated patterns. Note, however, that this applies only
to data characters. Whitespace characters may never appear
within special character sequences in a pattern, for example
within the sequence (?( which introduces a conditional sub-
pattern.
PCRE_EXTRA
This option was invented in order to turn on additional
functionality of PCRE that is incompatible with Perl, but it
is currently of very little use. When set, any backslash in
a pattern that is followed by a letter that has no special
meaning causes an error, thus reserving these combinations
for future expansion. By default, as in Perl, a backslash
followed by a letter with no special meaning is treated as a
literal. There are at present no other features controlled
by this option. It can also be set by a (?X) option setting
within a pattern.
PCRE_MULTILINE
By default, PCRE treats the subject string as consisting of
a single "line" of characters (even if it actually contains
several newlines). The "start of line" metacharacter (^)
matches only at the start of the string, while the "end of
line" metacharacter ($) matches only at the end of the
string, or before a terminating newline (unless
PCRE_DOLLAR_ENDONLY is set). This is the same as Perl.
When PCRE_MULTILINE is set, the "start of line" and "end
of line" constructs match immediately following or immedi-
ately before any newline in the subject string, respec-
tively, as well as at the very start and end. This is
equivalent to Perl's /m option. If there are no "\n" charac-
ters in a subject string, or no occurrences of ^ or $ in a
pattern, setting PCRE_MULTILINE has no effect.
PCRE_UNGREEDY
This option inverts the "greediness" of the quantifiers so
that they are not greedy by default, but become greedy if
followed by "?". It is not compatible with Perl. It can also
be set by a (?U) option setting within the pattern.
PCRE_UTF8
This option causes PCRE to regard both the pattern and the
subject as strings of UTF-8 characters instead of just byte
strings. However, it is available only if PCRE has been
built to include UTF-8 support. If not, the use of this
option provokes an error. Support for UTF-8 is new, experi-
mental, and incomplete. Details of exactly what it entails
are given below.
STUDYING A PATTERN
When a pattern is going to be used several times, it is
worth spending more time analyzing it in order to speed up
the time taken for matching. The function pcre_study() takes
a pointer to a compiled pattern as its first argument, and
returns a pointer to a pcre_extra block (another void
typedef) containing additional information about the pat-
tern; this can be passed to pcre_exec(). If no additional
information is available, NULL is returned.
The second argument contains option bits. At present, no
options are defined for pcre_study(), and this argument
should always be zero.
The third argument for pcre_study() is a pointer to an error
message. If studying succeeds (even if no data is returned),
the variable it points to is set to NULL. Otherwise it
points to a textual error message.
At present, studying a pattern is useful only for non-
anchored patterns that do not have a single fixed starting
character. A bitmap of possible starting characters is
created.
LOCALE SUPPORT
PCRE handles caseless matching, and determines whether char-
acters are letters, digits, or whatever, by reference to a
set of tables. The library contains a default set of tables
which is created in the default C locale when PCRE is com-
piled. This is used when the final argument of
pcre_compile() is NULL, and is sufficient for many applica-
tions.
An alternative set of tables can, however, be supplied. Such
tables are built by calling the pcre_maketables() function,
which has no arguments, in the relevant locale. The result
can then be passed to pcre_compile() as often as necessary.
For example, to build and use tables that are appropriate
for the French locale (where accented characters with codes
greater than 128 are treated as letters), the following code
could be used:
setlocale(LC_CTYPE, "fr");
tables = pcre_maketables();
re = pcre_compile(..., tables);
The tables are built in memory that is obtained via
pcre_malloc. The pointer that is passed to pcre_compile is
saved with the compiled pattern, and the same tables are
used via this pointer by pcre_study() and pcre_exec(). Thus
for any single pattern, compilation, studying and matching
all happen in the same locale, but different patterns can be
compiled in different locales. It is the caller's responsi-
bility to ensure that the memory containing the tables
remains available for as long as it is needed.
INFORMATION ABOUT A PATTERN
The pcre_fullinfo() function returns information about a
compiled pattern. It replaces the obsolete pcre_info() func-
tion, which is nevertheless retained for backwards compatibil-
ity (and is documented below).
The first argument for pcre_fullinfo() is a pointer to the
compiled pattern. The second argument is the result of
pcre_study(), or NULL if the pattern was not studied. The
third argument specifies which piece of information is
required, while the fourth argument is a pointer to a vari-
able to receive the data. The yield of the function is zero
for success, or one of the following negative numbers:
PCRE_ERROR_NULL the argument code was NULL
the argument where was NULL
PCRE_ERROR_BADMAGIC the "magic number" was not found
PCRE_ERROR_BADOPTION the value of what was invalid
The possible values for the third argument are defined in
pcre.h, and are as follows:
PCRE_INFO_OPTIONS
Return a copy of the options with which the pattern was com-
piled. The fourth argument should point to an unsigned long
int variable. These option bits are those specified in the
call to pcre_compile(), modified by any top-level option
settings within the pattern itself, and with the
PCRE_ANCHORED bit forcibly set if the form of the pattern
implies that it can match only at the start of a subject
string.
PCRE_INFO_SIZE
Return the size of the compiled pattern, that is, the value
that was passed as the argument to pcre_malloc() when PCRE
was getting memory in which to place the compiled data. The
fourth argument should point to a size_t variable.
PCRE_INFO_CAPTURECOUNT
Return the number of capturing subpatterns in the pattern.
The fourth argument should point to an int variable.
PCRE_INFO_BACKREFMAX
Return the number of the highest back reference in the
pattern. The fourth argument should point to an int vari-
able. Zero is returned if there are no back references.
PCRE_INFO_FIRSTCHAR
Return information about the first character of any matched
string, for a non-anchored pattern. If there is a fixed
first character, e.g. from a pattern such as
(cat|cow|coyote), it is returned in the integer pointed to
by where. Otherwise, if either
(a) the pattern was compiled with the PCRE_MULTILINE option,
and every branch starts with "^", or
(b) every branch of the pattern starts with ".*" and
PCRE_DOTALL is not set (if it were set, the pattern would be
anchored),
-1 is returned, indicating that the pattern matches only at
the start of a subject string or after any "\n" within the
string. Otherwise -2 is returned. For anchored patterns, -2
is returned.
PCRE_INFO_FIRSTTABLE
If the pattern was studied, and this resulted in the con-
struction of a 256-bit table indicating a fixed set of char-
acters for the first character in any matching string, a
pointer to the table is returned. Otherwise NULL is
returned. The fourth argument should point to an unsigned
char * variable.
PCRE_INFO_LASTLITERAL
For a non-anchored pattern, return the value of the right-
most literal character which must exist in any matched
string, other than at its start. The fourth argument should
point to an int variable. If there is no such character, or
if the pattern is anchored, -1 is returned. For example, for
the pattern /a\d+z\d+/ the returned value is 'z'.
The pcre_info() function is now obsolete because its inter-
face is too restrictive to return all the available data
about a compiled pattern. New programs should use
pcre_fullinfo() instead. The yield of pcre_info() is the
number of capturing subpatterns, or one of the following
negative numbers:
PCRE_ERROR_NULL the argument code was NULL
PCRE_ERROR_BADMAGIC the "magic number" was not found
If the optptr argument is not NULL, a copy of the options
with which the pattern was compiled is placed in the integer
it points to (see PCRE_INFO_OPTIONS above).
If the pattern is not anchored and the firstcharptr argument
is not NULL, it is used to pass back information about the
first character of any matched string (see
PCRE_INFO_FIRSTCHAR above).
MATCHING A PATTERN
The function pcre_exec() is called to match a subject string
against a pre-compiled pattern, which is passed in the code
argument. If the pattern has been studied, the result of the
study should be passed in the extra argument. Otherwise this
must be NULL.
The PCRE_ANCHORED option can be passed in the options argu-
ment, whose unused bits must be zero. However, if a pattern
was compiled with PCRE_ANCHORED, or turned out to be
anchored by virtue of its contents, it cannot be made
unanchored at matching time.
There are also three further options that can be set only at
matching time:
PCRE_NOTBOL
The first character of the string is not the beginning of a
line, so the circumflex metacharacter should not match
before it. Setting this without PCRE_MULTILINE (at compile
time) causes circumflex never to match.
PCRE_NOTEOL
The end of the string is not the end of a line, so the dol-
lar metacharacter should not match it nor (except in multi-
line mode) a newline immediately before it. Setting this
without PCRE_MULTILINE (at compile time) causes dollar never
to match.
PCRE_NOTEMPTY
An empty string is not considered to be a valid match if
this option is set. If there are alternatives in the pat-
tern, they are tried. If all the alternatives match the
empty string, the entire match fails. For example, if the
pattern
a?b?
is applied to a string not beginning with "a" or "b", it
matches the empty string at the start of the subject. With
PCRE_NOTEMPTY set, this match is not valid, so PCRE searches
further into the string for occurrences of "a" or "b".
Perl has no direct equivalent of PCRE_NOTEMPTY, but it does
make a special case of a pattern match of the empty string
within its split() function, and when using the /g modifier.
It is possible to emulate Perl's behaviour after matching a
null string by first trying the match again at the same
offset with PCRE_NOTEMPTY set, and then if that fails by
advancing the starting offset (see below) and trying an
ordinary match again.
The subject string is passed as a pointer in subject, a
length in length, and a starting offset in startoffset.
Unlike the pattern string, it may contain binary zero char-
acters. When the starting offset is zero, the search for a
match starts at the beginning of the subject, and this is by
far the most common case.
A non-zero starting offset is useful when searching for
another match in the same subject by calling pcre_exec()
again after a previous success. Setting startoffset differs
from just passing over a shortened string and setting
PCRE_NOTBOL in the case of a pattern that begins with any
kind of lookbehind. For example, consider the pattern
\Biss\B
which finds occurrences of "iss" in the middle of words. (\B
matches only if the current position in the subject is not a
word boundary.) When applied to the string "Mississipi" the
first call to pcre_exec() finds the first occurrence. If
pcre_exec() is called again with just the remainder of the
subject, namely "issipi", it does not match, because \B is
always false at the start of the subject, which is deemed to
be a word boundary. However, if pcre_exec() is passed the
entire string again, but with startoffset set to 4, it finds
the second occurrence of "iss" because it is able to look
behind the starting point to discover that it is preceded by
a letter.
If a non-zero starting offset is passed when the pattern is
anchored, one attempt to match at the given offset is tried.
This can only succeed if the pattern does not require the
match to be at the start of the subject.
In general, a pattern matches a certain portion of the sub-
ject, and in addition, further substrings from the subject
may be picked out by parts of the pattern. Following the
usage in Jeffrey Friedl's book, this is called "capturing"
in what follows, and the phrase "capturing subpattern" is
used for a fragment of a pattern that picks out a substring.
PCRE supports several other kinds of parenthesized subpat-
tern that do not cause substrings to be captured.
Captured substrings are returned to the caller via a vector
of integer offsets whose address is passed in ovector. The
number of elements in the vector is passed in ovecsize. The
first two-thirds of the vector is used to pass back captured
substrings, each substring using a pair of integers. The
remaining third of the vector is used as workspace by
pcre_exec() while matching capturing subpatterns, and is not
available for passing back information. The length passed in
ovecsize should always be a multiple of three. If it is not,
it is rounded down.
When a match has been successful, information about captured
substrings is returned in pairs of integers, starting at the
beginning of ovector, and continuing up to two-thirds of its
length at the most. The first element of a pair is set to
the offset of the first character in a substring, and the
second is set to the offset of the first character after the
end of a substring. The first pair, ovector[0] and ovec-
tor[1], identify the portion of the subject string matched
by the entire pattern. The next pair is used for the first
capturing subpattern, and so on. The value returned by
pcre_exec() is the number of pairs that have been set. If
there are no capturing subpatterns, the return value from a
successful match is 1, indicating that just the first pair
of offsets has been set.
Some convenience functions are provided for extracting the
captured substrings as separate strings. These are described
in the following section.
It is possible for a capturing subpattern number n+1 to
match some part of the subject when subpattern n has not
been used at all. For example, if the string "abc" is
matched against the pattern (a|(z))(bc) subpatterns 1 and 3
are matched, but 2 is not. When this happens, both offset
values corresponding to the unused subpattern are set to -1.
If a capturing subpattern is matched repeatedly, it is the
last portion of the string that it matched that gets
returned.
If the vector is too small to hold all the captured sub-
strings, it is used as far as possible (up to two-thirds of
its length), and the function returns a value of zero. In
particular, if the substring offsets are not of interest,
pcre_exec() may be called with ovector passed as NULL and
ovecsize as zero. However, if the pattern contains back
references and the ovector isn't big enough to remember the
related substrings, PCRE has to get additional memory for
use during matching. Thus it is usually advisable to supply
an ovector.
Note that pcre_info() can be used to find out how many cap-
turing subpatterns there are in a compiled pattern. The
smallest size for ovector that will allow for n captured
substrings in addition to the offsets of the substring
matched by the whole pattern is (n+1)*3.
If pcre_exec() fails, it returns a negative number. The fol-
lowing are defined in the header file:
PCRE_ERROR_NOMATCH (-1)
The subject string did not match the pattern.
PCRE_ERROR_NULL (-2)
Either code or subject was passed as NULL, or ovector was
NULL and ovecsize was not zero.
PCRE_ERROR_BADOPTION (-3)
An unrecognized bit was set in the options argument.
PCRE_ERROR_BADMAGIC (-4)
PCRE stores a 4-byte "magic number" at the start of the com-
piled code, to catch the case when it is passed a junk
pointer. This is the error it gives when the magic number
isn't present.
PCRE_ERROR_UNKNOWN_NODE (-5)
While running the pattern match, an unknown item was encoun-
tered in the compiled pattern. This error could be caused by
a bug in PCRE or by overwriting of the compiled pattern.
PCRE_ERROR_NOMEMORY (-6)
If a pattern contains back references, but the ovector that
is passed to pcre_exec() is not big enough to remember the
referenced substrings, PCRE gets a block of memory at the
start of matching to use for this purpose. If the call via
pcre_malloc() fails, this error is given. The memory is
freed at the end of matching.
EXTRACTING CAPTURED SUBSTRINGS
Captured substrings can be accessed directly by using the
offsets returned by pcre_exec() in ovector. For convenience,
the functions pcre_copy_substring(), pcre_get_substring(),
and pcre_get_substring_list() are provided for extracting
captured substrings as new, separate, zero-terminated
strings. A substring that contains a binary zero is
correctly extracted and has a further zero added on the end,
but the result does not, of course, function as a C string.
The first three arguments are the same for all three func-
tions: subject is the subject string which has just been
successfully matched, ovector is a pointer to the vector of
integer offsets that was passed to pcre_exec(), and
stringcount is the number of substrings that were captured
by the match, including the substring that matched the
entire regular expression. This is the value returned by
pcre_exec if it is greater than zero. If pcre_exec()
returned zero, indicating that it ran out of space in ovec-
tor, the value passed as stringcount should be the size of
the vector divided by three.
The functions pcre_copy_substring() and pcre_get_substring()
extract a single substring, whose number is given as string-
number. A value of zero extracts the substring that matched
the entire pattern, while higher values extract the captured
substrings. For pcre_copy_substring(), the string is placed
in buffer, whose length is given by buffersize, while for
pcre_get_substring() a new block of memory is obtained via
pcre_malloc, and its address is returned via stringptr. The
yield of the function is the length of the string, not
including the terminating zero, or one of
PCRE_ERROR_NOMEMORY (-6)
The buffer was too small for pcre_copy_substring(), or the
attempt to get memory failed for pcre_get_substring().
PCRE_ERROR_NOSUBSTRING (-7)
There is no substring whose number is stringnumber.
The pcre_get_substring_list() function extracts all avail-
able substrings and builds a list of pointers to them. All
this is done in a single block of memory which is obtained
via pcre_malloc. The address of the memory block is returned
via listptr, which is also the start of the list of string
pointers. The end of the list is marked by a NULL pointer.
The yield of the function is zero if all went well, or
PCRE_ERROR_NOMEMORY (-6)
if the attempt to get the memory block failed.
When any of these functions encounter a substring that is
unset, which can happen when capturing subpattern number n+1
matches some part of the subject, but subpattern n has not
been used at all, they return an empty string. This can be
distinguished from a genuine zero-length substring by
inspecting the appropriate offset in ovector, which is nega-
tive for unset substrings.
The two convenience functions pcre_free_substring() and
pcre_free_substring_list() can be used to free the memory
returned by a previous call of pcre_get_substring() or
pcre_get_substring_list(), respectively. They do nothing
more than call the function pointed to by pcre_free, which
of course could be called directly from a C program. How-
ever, PCRE is used in some situations where it is linked via
a special interface to another programming language which
cannot use pcre_free directly; it is for these cases that
the functions are provided.
LIMITATIONS
There are some size limitations in PCRE but it is hoped that
they will never in practice be relevant. The maximum length
of a compiled pattern is 65539 (sic) bytes. All values in
repeating quantifiers must be less than 65536. The maximum
number of capturing subpatterns is 99. The maximum number
of all parenthesized subpatterns, including capturing sub-
patterns, assertions, and other types of subpattern, is 200.
The maximum length of a subject string is the largest posi-
tive number that an integer variable can hold. However, PCRE
uses recursion to handle subpatterns and indefinite repeti-
tion. This means that the available stack space may limit
the size of a subject string that can be processed by cer-
tain patterns.
DIFFERENCES FROM PERL
The differences described here are with respect to Perl
5.005.
1. By default, a whitespace character is any character that
the C library function isspace() recognizes, though it is
possible to compile PCRE with alternative character type
tables. Normally isspace() matches space, formfeed, newline,
carriage return, horizontal tab, and vertical tab. Perl 5 no
longer includes vertical tab in its set of whitespace char-
acters. The \v escape that was in the Perl documentation for
a long time was never in fact recognized. However, the char-
acter itself was treated as whitespace at least up to 5.002.
In 5.004 and 5.005 it does not match \s.
2. PCRE does not allow repeat quantifiers on lookahead
assertions. Perl permits them, but they do not mean what you
might think. For example, (?!a){3} does not assert that the
next three characters are not "a". It just asserts that the
next character is not "a" three times.
3. Capturing subpatterns that occur inside negative looka-
head assertions are counted, but their entries in the
offsets vector are never set. Perl sets its numerical vari-
ables from any such patterns that are matched before the
assertion fails to match something (thereby succeeding), but
only if the negative lookahead assertion contains just one
branch.
4. Though binary zero characters are supported in the sub-
ject string, they are not allowed in a pattern string
because it is passed as a normal C string, terminated by
zero. The escape sequence "\0" can be used in the pattern to
represent a binary zero.
5. The following Perl escape sequences are not supported:
\l, \u, \L, \U, \E, \Q. In fact these are implemented by
Perl's general string-handling and are not part of its pat-
tern matching engine.
6. The Perl \G assertion is not supported as it is not
relevant to single pattern matches.
7. Fairly obviously, PCRE does not support the (?{code}) and
(?p{code}) constructions. However, there is some experimen-
tal support for recursive patterns using the non-Perl item
(?R).
8. There are at the time of writing some oddities in Perl
5.005_02 concerned with the settings of captured strings
when part of a pattern is repeated. For example, matching
"aba" against the pattern /^(a(b)?)+$/ sets $2 to the value
"b", but matching "aabbaa" against /^(aa(bb)?)+$/ leaves $2
unset. However, if the pattern is changed to
/^(aa(b(b))?)+$/ then $2 (and $3) are set.
In Perl 5.004 $2 is set in both cases, and that is also true
of PCRE. If in the future Perl changes to a consistent state
that is different, PCRE may change to follow.
9. Another as yet unresolved discrepancy is that in Perl
5.005_02 the pattern /^(a)?(?(1)a|b)+$/ matches the string
"a", whereas in PCRE it does not. However, in both Perl and
PCRE /^(a)?a/ matched against "a" leaves $1 unset.
10. PCRE provides some extensions to the Perl regular
expression facilities:
(a) Although lookbehind assertions must match fixed length
strings, each alternative branch of a lookbehind assertion
can match a different length of string. Perl 5.005 requires
them all to have the same length.
(b) If PCRE_DOLLAR_ENDONLY is set and PCRE_MULTILINE is not
set, the $ meta-character matches only at the very end of
the string.
(c) If PCRE_EXTRA is set, a backslash followed by a letter
with no special meaning is faulted.
(d) If PCRE_UNGREEDY is set, the greediness of the repeti-
tion quantifiers is inverted, that is, by default they are
not greedy, but if followed by a question mark they are.
(e) PCRE_ANCHORED can be used to force a pattern to be tried
only at the start of the subject.
(f) The PCRE_NOTBOL, PCRE_NOTEOL, and PCRE_NOTEMPTY options
for pcre_exec() have no Perl equivalents.
(g) The (?R) construct allows for recursive pattern matching
(Perl 5.6 can do this using the (?p{code}) construct, which
PCRE cannot of course support.)
REGULAR EXPRESSION DETAILS
The syntax and semantics of the regular expressions sup-
ported by PCRE are described below. Regular expressions are
also described in the Perl documentation and in a number of
other books, some of which have copious examples. Jeffrey
Friedl's "Mastering Regular Expressions", published by
O'Reilly (ISBN 1-56592-257-3), covers them in great detail.
The description here is intended as reference documentation.
The basic operation of PCRE is on strings of bytes. However,
there are the beginnings of some support for UTF-8 character
strings. To use this support you must configure PCRE to
include it, and then call pcre_compile() with the PCRE_UTF8
option. How this affects the pattern matching is described
in the final section of this document.
A regular expression is a pattern that is matched against a
subject string from left to right. Most characters stand for
themselves in a pattern, and match the corresponding charac-
ters in the subject. As a trivial example, the pattern
The quick brown fox
matches a portion of a subject string that is identical to
itself. The power of regular expressions comes from the
ability to include alternatives and repetitions in the pat-
tern. These are encoded in the pattern by the use of meta-
characters, which do not stand for themselves but instead
are interpreted in some special way.
There are two different sets of meta-characters: those that
are recognized anywhere in the pattern except within square
brackets, and those that are recognized in square brackets.
Outside square brackets, the meta-characters are as follows:
\ general escape character with several uses
^ assert start of subject (or line, in multiline
mode)
$ assert end of subject (or line, in multiline mode)
. match any character except newline (by default)
[ start character class definition
| start of alternative branch
( start subpattern
) end subpattern
? extends the meaning of (
also 0 or 1 quantifier
also quantifier minimizer
* 0 or more quantifier
+ 1 or more quantifier
{ start min/max quantifier
Part of a pattern that is in square brackets is called a
"character class". In a character class the only meta-
characters are:
\ general escape character
^ negate the class, but only if it is the first character
- indicates character range
] terminates the character class
The following sections describe the use of each of the
meta-characters.
BACKSLASH
The backslash character has several uses. Firstly, if it is
followed by a non-alphameric character, it takes away any
special meaning that character may have. This use of
backslash as an escape character applies both inside and
outside character classes.
For example, if you want to match a "*" character, you write
"\*" in the pattern. This applies whether or not the follow-
ing character would otherwise be interpreted as a meta-
character, so it is always safe to precede a non-alphameric
with "\" to specify that it stands for itself. In particu-
lar, if you want to match a backslash, you write "\\".
If a pattern is compiled with the PCRE_EXTENDED option, whi-
tespace in the pattern (other than in a character class) and
characters between a "#" outside a character class and the
next newline character are ignored. An escaping backslash
can be used to include a whitespace or "#" character as part
of the pattern.
A second use of backslash provides a way of encoding non-
printing characters in patterns in a visible manner. There
is no restriction on the appearance of non-printing charac-
ters, apart from the binary zero that terminates a pattern,
but when a pattern is being prepared by text editing, it is
usually easier to use one of the following escape sequences
than the binary character it represents:
\a alarm, that is, the BEL character (hex 07)
\cx "control-x", where x is any character
\e escape (hex 1B)
\f formfeed (hex 0C)
\n newline (hex 0A)
\r carriage return (hex 0D)
\t tab (hex 09)
\xhh character with hex code hh
\ddd character with octal code ddd, or backreference
The precise effect of "\cx" is as follows: if "x" is a lower
case letter, it is converted to upper case. Then bit 6 of
the character (hex 40) is inverted. Thus "\cz" becomes hex
1A, but "\c{" becomes hex 3B, while "\c;" becomes hex 7B.
After "\x", up to two hexadecimal digits are read (letters
can be in upper or lower case).
After "\0" up to two further octal digits are read. In both
cases, if there are fewer than two digits, just those that
are present are used. Thus the sequence "\0\x\07" specifies
two binary zeros followed by a BEL character. Make sure you
supply two digits after the initial zero if the character
that follows is itself an octal digit.
The handling of a backslash followed by a digit other than 0
is complicated. Outside a character class, PCRE reads it
and any following digits as a decimal number. If the number
is less than 10, or if there have been at least that many
previous capturing left parentheses in the expression, the
entire sequence is taken as a back reference. A description
of how this works is given later, following the discussion
of parenthesized subpatterns.
Inside a character class, or if the decimal number is
greater than 9 and there have not been that many capturing
subpatterns, PCRE re-reads up to three octal digits follow-
ing the backslash, and generates a single byte from the
least significant 8 bits of the value. Any subsequent digits
stand for themselves. For example:
\040 is another way of writing a space
\40 is the same, provided there are fewer than 40
previous capturing subpatterns
\7 is always a back reference
\11 might be a back reference, or another way of
writing a tab
\011 is always a tab
\0113 is a tab followed by the character "3"
\113 is the character with octal code 113 (since there
can be no more than 99 back references)
\377 is a byte consisting entirely of 1 bits
\81 is either a back reference, or a binary zero
followed by the two characters "8" and "1"
Note that octal values of 100 or greater must not be intro-
duced by a leading zero, because no more than three octal
digits are ever read.
All the sequences that define a single byte value can be
used both inside and outside character classes. In addition,
inside a character class, the sequence "\b" is interpreted
as the backspace character (hex 08). Outside a character
class it has a different meaning (see below).
The third use of backslash is for specifying generic charac-
ter types:
\d any decimal digit
\D any character that is not a decimal digit
\s any whitespace character
\S any character that is not a whitespace character
\w any "word" character
\W any "non-word" character
Each pair of escape sequences partitions the complete set of
characters into two disjoint sets. Any given character
matches one, and only one, of each pair.
A "word" character is any letter or digit or the underscore
character, that is, any character which can be part of a
Perl "word". The definition of letters and digits is con-
trolled by PCRE's character tables, and may vary if locale-
specific matching is taking place (see "Locale support"
above). For example, in the "fr" (French) locale, some char-
acter codes greater than 128 are used for accented letters,
and these are matched by \w.
These character type sequences can appear both inside and
outside character classes. They each match one character of
the appropriate type. If the current matching point is at
the end of the subject string, all of them fail, since there
is no character to match.
The fourth use of backslash is for certain simple asser-
tions. An assertion specifies a condition that has to be met
at a particular point in a match, without consuming any
characters from the subject string. The use of subpatterns
for more complicated assertions is described below. The
backslashed assertions are
\b word boundary
\B not a word boundary
\A start of subject (independent of multiline mode)
\Z end of subject or newline at end (independent of
multiline mode)
\z end of subject (independent of multiline mode)
These assertions may not appear in character classes (but
note that "\b" has a different meaning, namely the backspace
character, inside a character class).
A word boundary is a position in the subject string where
the current character and the previous character do not both
match \w or \W (i.e. one matches \w and the other matches
\W), or the start or end of the string if the first or last
character matches \w, respectively.
The \A, \Z, and \z assertions differ from the traditional
circumflex and dollar (described below) in that they only
ever match at the very start and end of the subject string,
whatever options are set. They are not affected by the
PCRE_NOTBOL or PCRE_NOTEOL options. If the startoffset argu-
ment of pcre_exec() is non-zero, \A can never match. The
difference between \Z and \z is that \Z matches before a
newline that is the last character of the string as well as
at the end of the string, whereas \z matches only at the
end.
CIRCUMFLEX AND DOLLAR
Outside a character class, in the default matching mode, the
circumflex character is an assertion which is true only if
the current matching point is at the start of the subject
string. If the startoffset argument of pcre_exec() is non-
zero, circumflex can never match. Inside a character class,
circumflex has an entirely different meaning (see below).
Circumflex need not be the first character of the pattern if
a number of alternatives are involved, but it should be the
first thing in each alternative in which it appears if the
pattern is ever to match that branch. If all possible alter-
natives start with a circumflex, that is, if the pattern is
constrained to match only at the start of the subject, it is
said to be an "anchored" pattern. (There are also other con-
structs that can cause a pattern to be anchored.)
A dollar character is an assertion which is true only if the
current matching point is at the end of the subject string,
or immediately before a newline character that is the last
character in the string (by default). Dollar need not be the
last character of the pattern if a number of alternatives
are involved, but it should be the last item in any branch
in which it appears. Dollar has no special meaning in a
character class.
The meaning of dollar can be changed so that it matches only
at the very end of the string, by setting the
PCRE_DOLLAR_ENDONLY option at compile or matching time. This
does not affect the \Z assertion.
The meanings of the circumflex and dollar characters are
changed if the PCRE_MULTILINE option is set. When this is
the case, they match immediately after and immediately
before an internal "\n" character, respectively, in addition
to matching at the start and end of the subject string. For
example, the pattern /^abc$/ matches the subject string
"def\nabc" in multiline mode, but not otherwise. Conse-
quently, patterns that are anchored in single line mode
because all branches start with "^" are not anchored in mul-
tiline mode, and a match for circumflex is possible when the
startoffset argument of pcre_exec() is non-zero. The
PCRE_DOLLAR_ENDONLY option is ignored if PCRE_MULTILINE is
set.
Note that the sequences \A, \Z, and \z can be used to match
the start and end of the subject in both modes, and if all
branches of a pattern start with \A it is always anchored,
whether PCRE_MULTILINE is set or not.
FULL STOP (PERIOD, DOT)
Outside a character class, a dot in the pattern matches any
one character in the subject, including a non-printing char-
acter, but not (by default) newline. If the PCRE_DOTALL
option is set, dots match newlines as well. The handling of
dot is entirely independent of the handling of circumflex
and dollar, the only relationship being that they both
involve newline characters. Dot has no special meaning in a
character class.
SQUARE BRACKETS
An opening square bracket introduces a character class, ter-
minated by a closing square bracket. A closing square
bracket on its own is not special. If a closing square
bracket is required as a member of the class, it should be
the first data character in the class (after an initial cir-
cumflex, if present) or escaped with a backslash.
A character class matches a single character in the subject;
the character must be in the set of characters defined by
the class, unless the first character in the class is a cir-
cumflex, in which case the subject character must not be in
the set defined by the class. If a circumflex is actually
required as a member of the class, ensure it is not the
first character, or escape it with a backslash.
For example, the character class [aeiou] matches any lower
case vowel, while [^aeiou] matches any character that is not
a lower case vowel. Note that a circumflex is just a con-
venient notation for specifying the characters which are in
the class by enumerating those that are not. It is not an
assertion: it still consumes a character from the subject
string, and fails if the current pointer is at the end of
the string.
When caseless matching is set, any letters in a class
represent both their upper case and lower case versions, so
for example, a caseless [aeiou] matches "A" as well as "a",
and a caseless [^aeiou] does not match "A", whereas a case-
ful version would.
The newline character is never treated in any special way in
character classes, whatever the setting of the PCRE_DOTALL
or PCRE_MULTILINE options is. A class such as [^a] will
always match a newline.
The minus (hyphen) character can be used to specify a range
of characters in a character class. For example, [d-m]
matches any letter between d and m, inclusive. If a minus
character is required in a class, it must be escaped with a
backslash or appear in a position where it cannot be inter-
preted as indicating a range, typically as the first or last
character in the class.
It is not possible to have the literal character "]" as the
end character of a range. A pattern such as [W-]46] is
interpreted as a class of two characters ("W" and "-") fol-
lowed by a literal string "46]", so it would match "W46]" or
"-46]". However, if the "]" is escaped with a backslash it
is interpreted as the end of range, so [W-\]46] is inter-
preted as a single class containing a range followed by two
separate characters. The octal or hexadecimal representation
of "]" can also be used to end a range.
Ranges operate in ASCII collating sequence. They can also be
used for characters specified numerically, for example
[\000-\037]. If a range that includes letters is used when
caseless matching is set, it matches the letters in either
case. For example, [W-c] is equivalent to [][\^_`wxyzabc],
matched caselessly, and if character tables for the "fr"
locale are in use, [\xc8-\xcb] matches accented E characters
in both cases.
The character types \d, \D, \s, \S, \w, and \W may also
appear in a character class, and add the characters that
they match to the class. For example, [\dABCDEF] matches any
hexadecimal digit. A circumflex can conveniently be used
with the upper case character types to specify a more res-
tricted set of characters than the matching lower case type.
For example, the class [^\W_] matches any letter or digit,
but not underscore.
All non-alphameric characters other than \, -, ^ (at the
start) and the terminating ] are non-special in character
classes, but it does no harm if they are escaped.
POSIX CHARACTER CLASSES
Perl 5.6 (not yet released at the time of writing) is going
to support the POSIX notation for character classes, which
uses names enclosed by [: and :] within the enclosing
square brackets. PCRE supports this notation. For example,
[01[:alpha:]%]
matches "0", "1", any alphabetic character, or "%". The sup-
ported class names are
alnum letters and digits
alpha letters
ascii character codes 0 - 127
cntrl control characters
digit decimal digits (same as \d)
graph printing characters, excluding space
lower lower case letters
print printing characters, including space
punct printing characters, excluding letters and digits
space white space (same as \s)
upper upper case letters
word "word" characters (same as \w)
xdigit hexadecimal digits
The names "ascii" and "word" are Perl extensions. Another
Perl extension is negation, which is indicated by a ^ char-
acter after the colon. For example,
[12[:^digit:]]
matches "1", "2", or any non-digit. PCRE (and Perl) also
recognize the POSIX syntax [.ch.] and [=ch=] where "ch" is a
"collating element", but these are not supported, and an
error is given if they are encountered.
VERTICAL BAR
Vertical bar characters are used to separate alternative
patterns. For example, the pattern
gilbert|sullivan
matches either "gilbert" or "sullivan". Any number of alter-
natives may appear, and an empty alternative is permitted
(matching the empty string). The matching process tries
each alternative in turn, from left to right, and the first
one that succeeds is used. If the alternatives are within a
subpattern (defined below), "succeeds" means matching the
rest of the main pattern as well as the alternative in the
subpattern.
INTERNAL OPTION SETTING
The settings of PCRE_CASELESS, PCRE_MULTILINE, PCRE_DOTALL,
and PCRE_EXTENDED can be changed from within the pattern by
a sequence of Perl option letters enclosed between "(?" and
")". The option letters are
i for PCRE_CASELESS
m for PCRE_MULTILINE
s for PCRE_DOTALL
x for PCRE_EXTENDED
For example, (?im) sets caseless, multiline matching. It is
also possible to unset these options by preceding the letter
with a hyphen, and a combined setting and unsetting such as
(?im-sx), which sets PCRE_CASELESS and PCRE_MULTILINE while
unsetting PCRE_DOTALL and PCRE_EXTENDED, is also permitted.
If a letter appears both before and after the hyphen, the
option is unset.
The scope of these option changes depends on where in the
pattern the setting occurs. For settings that are outside
any subpattern (defined below), the effect is the same as if
the options were set or unset at the start of matching. The
following patterns all behave in exactly the same way:
(?i)abc
a(?i)bc
ab(?i)c
abc(?i)
which in turn is the same as compiling the pattern abc with
PCRE_CASELESS set. In other words, such "top level" set-
tings apply to the whole pattern (unless there are other
changes inside subpatterns). If there is more than one set-
ting of the same option at top level, the rightmost setting
is used.
If an option change occurs inside a subpattern, the effect
is different. This is a change of behaviour in Perl 5.005.
An option change inside a subpattern affects only that part
of the subpattern that follows it, so
(a(?i)b)c
matches abc and aBc and no other strings (assuming
PCRE_CASELESS is not used). By this means, options can be
made to have different settings in different parts of the
pattern. Any changes made in one alternative do carry on
into subsequent branches within the same subpattern. For
example,
(a(?i)b|c)
matches "ab", "aB", "c", and "C", even though when matching
"C" the first branch is abandoned before the option setting.
This is because the effects of option settings happen at
compile time. There would be some very weird behaviour oth-
erwise.
The PCRE-specific options PCRE_UNGREEDY and PCRE_EXTRA can
be changed in the same way as the Perl-compatible options by
using the characters U and X respectively. The (?X) flag
setting is special in that it must always occur earlier in
the pattern than any of the additional features it turns on,
even when it is at top level. It is best put at the start.
SUBPATTERNS
Subpatterns are delimited by parentheses (round brackets),
which can be nested. Marking part of a pattern as a subpat-
tern does two things:
1. It localizes a set of alternatives. For example, the pat-
tern
cat(aract|erpillar|)
matches one of the words "cat", "cataract", or "caterpil-
lar". Without the parentheses, it would match "cataract",
"erpillar" or the empty string.
2. It sets up the subpattern as a capturing subpattern (as
defined above). When the whole pattern matches, that por-
tion of the subject string that matched the subpattern is
passed back to the caller via the ovector argument of
pcre_exec(). Opening parentheses are counted from left to
right (starting from 1) to obtain the numbers of the captur-
ing subpatterns.
For example, if the string "the red king" is matched against
the pattern
the ((red|white) (king|queen))
the captured substrings are "red king", "red", and "king",
and are numbered 1, 2, and 3.
The fact that plain parentheses fulfil two functions is not
always helpful. There are often times when a grouping sub-
pattern is required without a capturing requirement. If an
opening parenthesis is followed by "?:", the subpattern does
not do any capturing, and is not counted when computing the
number of any subsequent capturing subpatterns. For example,
if the string "the white queen" is matched against the pat-
tern
the ((?:red|white) (king|queen))
the captured substrings are "white queen" and "queen", and
are numbered 1 and 2. The maximum number of captured sub-
strings is 99, and the maximum number of all subpatterns,
both capturing and non-capturing, is 200.
As a convenient shorthand, if any option settings are
required at the start of a non-capturing subpattern, the
option letters may appear between the "?" and the ":". Thus
the two patterns
(?i:saturday|sunday)
(?:(?i)saturday|sunday)
match exactly the same set of strings. Because alternative
branches are tried from left to right, and options are not
reset until the end of the subpattern is reached, an option
setting in one branch does affect subsequent branches, so
the above patterns match "SUNDAY" as well as "Saturday".
REPETITION
Repetition is specified by quantifiers, which can follow any
of the following items:
a single character, possibly escaped
the . metacharacter
a character class
a back reference (see next section)
a parenthesized subpattern (unless it is an assertion -
see below)
The general repetition quantifier specifies a minimum and
maximum number of permitted matches, by giving the two
numbers in curly brackets (braces), separated by a comma.
The numbers must be less than 65536, and the first must be
less than or equal to the second. For example:
z{2,4}
matches "zz", "zzz", or "zzzz". A closing brace on its own
is not a special character. If the second number is omitted,
but the comma is present, there is no upper limit; if the
second number and the comma are both omitted, the quantifier
specifies an exact number of required matches. Thus
[aeiou]{3,}
matches at least 3 successive vowels, but may match many
more, while
\d{8}
matches exactly 8 digits. An opening curly bracket that
appears in a position where a quantifier is not allowed, or
one that does not match the syntax of a quantifier, is taken
as a literal character. For example, {,6} is not a quantif-
ier, but a literal string of four characters.
The quantifier {0} is permitted, causing the expression to
behave as if the previous item and the quantifier were not
present.
For convenience (and historical compatibility) the three
most common quantifiers have single-character abbreviations:
* is equivalent to {0,}
+ is equivalent to {1,}
? is equivalent to {0,1}
It is possible to construct infinite loops by following a
subpattern that can match no characters with a quantifier
that has no upper limit, for example:
(a?)*
Earlier versions of Perl and PCRE used to give an error at
compile time for such patterns. However, because there are
cases where this can be useful, such patterns are now
accepted, but if any repetition of the subpattern does in
fact match no characters, the loop is forcibly broken.
By default, the quantifiers are "greedy", that is, they
match as much as possible (up to the maximum number of per-
mitted times), without causing the rest of the pattern to
fail. The classic example of where this gives problems is in
trying to match comments in C programs. These appear between
the sequences /* and */ and within the sequence, individual
* and / characters may appear. An attempt to match C com-
ments by applying the pattern
/\*.*\*/
to the string
/* first comment */ not comment /* second comment */
fails, because it matches the entire string owing to the
greediness of the .* item.
However, if a quantifier is followed by a question mark, it
ceases to be greedy, and instead matches the minimum number
of times possible, so the pattern
/\*.*?\*/
does the right thing with the C comments. The meaning of the
various quantifiers is not otherwise changed, just the pre-
ferred number of matches. Do not confuse this use of ques-
tion mark with its use as a quantifier in its own right.
Because it has two uses, it can sometimes appear doubled, as
in
\d??\d
which matches one digit by preference, but can match two if
that is the only way the rest of the pattern matches.
If the PCRE_UNGREEDY option is set (an option which is not
available in Perl), the quantifiers are not greedy by
default, but individual ones can be made greedy by following
them with a question mark. In other words, it inverts the
default behaviour.
When a parenthesized subpattern is quantified with a minimum
repeat count that is greater than 1 or with a limited max-
imum, more store is required for the compiled pattern, in
proportion to the size of the minimum or maximum.
If a pattern starts with .* or .{0,} and the PCRE_DOTALL
option (equivalent to Perl's /s) is set, thus allowing the .
to match newlines, the pattern is implicitly anchored,
because whatever follows will be tried against every charac-
ter position in the subject string, so there is no point in
retrying the overall match at any position after the first.
PCRE treats such a pattern as though it were preceded by \A.
In cases where it is known that the subject string contains
no newlines, it is worth setting PCRE_DOTALL when the pat-
tern begins with .* in order to obtain this optimization, or
alternatively using ^ to indicate anchoring explicitly.
When a capturing subpattern is repeated, the value captured
is the substring that matched the final iteration. For exam-
ple, after
(tweedle[dume]{3}\s*)+
has matched "tweedledum tweedledee" the value of the cap-
tured substring is "tweedledee". However, if there are
nested capturing subpatterns, the corresponding captured
values may have been set in previous iterations. For exam-
ple, after
/(a|(b))+/
matches "aba" the value of the second captured substring is
"b".
BACK REFERENCES
Outside a character class, a backslash followed by a digit
greater than 0 (and possibly further digits) is a back
reference to a capturing subpattern earlier (i.e. to its
left) in the pattern, provided there have been that many
previous capturing left parentheses.
However, if the decimal number following the backslash is
less than 10, it is always taken as a back reference, and
causes an error only if there are not that many capturing
left parentheses in the entire pattern. In other words, the
parentheses that are referenced need not be to the left of
the reference for numbers less than 10. See the section
entitled "Backslash" above for further details of the han-
dling of digits following a backslash.
A back reference matches whatever actually matched the cap-
turing subpattern in the current subject string, rather than
anything matching the subpattern itself. So the pattern
(sens|respons)e and \1ibility
matches "sense and sensibility" and "response and responsi-
bility", but not "sense and responsibility". If caseful
matching is in force at the time of the back reference, the
case of letters is relevant. For example,
((?i)rah)\s+\1
matches "rah rah" and "RAH RAH", but not "RAH rah", even
though the original capturing subpattern is matched case-
lessly.
There may be more than one back reference to the same sub-
pattern. If a subpattern has not actually been used in a
particular match, any back references to it always fail. For
example, the pattern
(a|(bc))\2
always fails if it starts to match "a" rather than "bc".
Because there may be up to 99 back references, all digits
following the backslash are taken as part of a potential
back reference number. If the pattern continues with a digit
character, some delimiter must be used to terminate the back
reference. If the PCRE_EXTENDED option is set, this can be
whitespace. Otherwise an empty comment can be used.
A back reference that occurs inside the parentheses to which
it refers fails when the subpattern is first used, so, for
example, (a\1) never matches. However, such references can
be useful inside repeated subpatterns. For example, the pat-
tern
(a|b\1)+
matches any number of "a"s and also "aba", "ababbaa" etc. At
each iteration of the subpattern, the back reference matches
the character string corresponding to the previous
iteration. In order for this to work, the pattern must be
such that the first iteration does not need to match the
back reference. This can be done using alternation, as in
the example above, or by a quantifier with a minimum of
zero.
ASSERTIONS
An assertion is a test on the characters following or
preceding the current matching point that does not actually
consume any characters. The simple assertions coded as \b,
\B, \A, \Z, \z, ^ and $ are described above. More compli-
cated assertions are coded as subpatterns. There are two
kinds: those that look ahead of the current position in the
subject string, and those that look behind it.
An assertion subpattern is matched in the normal way, except
that it does not cause the current matching position to be
changed. Lookahead assertions start with (?= for positive
assertions and (?! for negative assertions. For example,
\w+(?=;)
matches a word followed by a semicolon, but does not include
the semicolon in the match, and
foo(?!bar)
matches any occurrence of "foo" that is not followed by
"bar". Note that the apparently similar pattern
(?!foo)bar
does not find an occurrence of "bar" that is preceded by
something other than "foo"; it finds any occurrence of "bar"
whatsoever, because the assertion (?!foo) is always true
when the next three characters are "bar". A lookbehind
assertion is needed to achieve this effect.
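Lookbehind assertions start with (?<= for positive asser-
tions and (?<! for negative assertions. For example,
(?<!foo)bar
does find an occurrence of "bar" that is not preceded by
"foo". The contents of a lookbehind assertion are restricted
such that all the strings it matches must have a fixed
length.
ONCE-ONLY SUBPATTERNS
With both maximizing and minimizing repetition, failure of
what follows normally causes the repeated item to be re-
evaluated to see if a different number of repeats allows the
rest of the pattern to match. Sometimes it is useful to
prevent this. Once-only subpatterns provide the means for
specifying that once a portion of the pattern has matched,
it is not to be re-evaluated in this way. The notation is
another kind of special parenthesis, starting with (?> as in
this example: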
(?>\d+)bar
This kind of parenthesis "locks up" the part of the pattern
it contains once it has matched, and a failure further into
the pattern is prevented from backtracking into it.
Backtracking past it to previous items, however, works as
normal.
An alternative description is that a subpattern of this type
matches the string of characters that an identical stan-
dalone pattern would match, if anchored at the current point
in the subject string.
Once-only subpatterns are not capturing subpatterns. Simple
cases such as the above example can be thought of as a max-
imizing repeat that must swallow everything it can. So,
while both \d+ and \d+? are prepared to adjust the number of
digits they match in order to make the rest of the pattern
match, (?>\d+) can only match an entire sequence of digits.
This construction can of course contain arbitrarily compli-
cated subpatterns, and it can be nested.
Once-only subpatterns can be used in conjunction with look-
behind assertions to specify efficient matching at the end
of the subject string. Consider a simple pattern such as
abcd$
when applied to a long string which does not match. Because
matching proceeds from left to right, PCRE will look for
each "a" in the subject and then see if what follows matches
the rest of the pattern. If the pattern is specified as
^.*abcd$
the initial .* matches the entire string at first, but when
this fails (because there is no following "a"), it back-
tracks to match all but the last character, then all but the
last two characters, and so on. Once again the search for
"a" covers the entire string, from right to left, so we are
no better off. However, if the pattern is written as
^(?>.*)(?<=abcd)
there can be no backtracking for the .* item; it can match
only the entire string. The subsequent lookbehind assertion
does a single test on the last four characters. If it fails,
the match fails immediately. For long strings, this approach
makes a significant difference to the processing time.
When a pattern contains an unlimited repeat inside a subpat-
tern that can itself be repeated an unlimited number of
times, the use of a once-only subpattern is the only way to
avoid some failing matches taking a very long time indeed.
The pattern
(\D+|<\d+>)*[!?]
matches an unlimited number of substrings that either con-
sist of non-digits, or digits enclosed in <>, followed by
either ! or ?. When it matches, it runs quickly. However, if
it is applied to
aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
it takes a long time before reporting failure. This is
because the string can be divided between the two repeats in
a large number of ways, and all have to be tried. (The exam-
ple used [!?] rather than a single character at the end,
because both PCRE and Perl have an optimization that allows
for fast failure when a single character is used. They
remember the last single character that is required for a
match, and fail early if it is not present in the string.)
If the pattern is changed to
((?>\D+)|<\d+>)*[!?]
sequences of non-digits cannot be broken, and failure hap-
pens quickly.
CONDITIONAL SUBPATTERNS
It is possible to cause the matching process to obey a sub-
pattern conditionally or to choose between two alternative
subpatterns, depending on the result of an assertion, or
whether a previous capturing subpattern matched or not. The
two possible forms of conditional subpattern are
(?(condition)yes-pattern)
(?(condition)yes-pattern|no-pattern)
If the condition is satisfied, the yes-pattern is used; oth-
erwise the no-pattern (if present) is used. If there are
more than two alternatives in the subpattern, a compile-time
error occurs.
There are two kinds of condition. If the text between the
parentheses consists of a sequence of digits, the condition
is satisfied if the capturing subpattern of that number has
previously matched. The number must be greater than zero.
Consider the following pattern, which contains non-
significant white space to make it more readable (assume the
PCRE_EXTENDED option) and to divide it into three parts for
ease of discussion:
( \( )? [^()]+ (?(1) \) )
The first part matches an optional opening parenthesis, and
if that character is present, sets it as the first captured
substring. The second part matches one or more characters
that are not parentheses. The third part is a conditional
subpattern that tests whether the first set of parentheses
matched or not. If they did, that is, if subject started
with an opening parenthesis, the condition is true, and so
the yes-pattern is executed and a closing parenthesis is
required. Otherwise, since no-pattern is not present, the
subpattern matches nothing. In other words, this pattern
matches a sequence of non-parentheses, optionally enclosed
in parentheses.
If the condition is not a sequence of digits, it must be an
assertion. This may be a positive or negative lookahead or
lookbehind assertion. Consider this pattern, again contain-
ing non-significant white space, and with the two alterna-
tives on the second line:
(?(?=[^a-z]*[a-z])
\d{2}-[a-z]{3}-\d{2} | \d{2}-\d{2}-\d{2} )
The condition is a positive lookahead assertion that matches
an optional sequence of non-letters followed by a letter. In
other words, it tests for the presence of at least one
letter in the subject. If a letter is found, the subject is
matched against the first alternative; otherwise it is
matched against the second. This pattern matches strings in
one of the two forms dd-aaa-dd or dd-dd-dd, where aaa are
letters and dd are digits.
COMMENTS
The sequence (?# marks the start of a comment which contin-
ues up to the next closing parenthesis. Nested parentheses
are not permitted. The characters that make up a comment
play no part in the pattern matching at all.
If the PCRE_EXTENDED option is set, an unescaped # character
outside a character class introduces a comment that contin-
ues up to the next newline character in the pattern.
RECURSIVE PATTERNS
Consider the problem of matching a string in parentheses,
allowing for unlimited nested parentheses. Without the use
of recursion, the best that can be done is to use a pattern
that matches up to some fixed depth of nesting. It is not
possible to handle an arbitrary nesting depth. Perl 5.6 has
provided an experimental facility that allows regular
expressions to recurse (amongst other things). It does this
by interpolating Perl code in the expression at run time,
and the code can refer to the expression itself. A Perl pat-
tern to solve the parentheses problem can be created like
this:
$re = qr{\( (?: (?>[^()]+) | (?p{$re}) )* \)}x;
The (?p{...}) item interpolates Perl code at run time, and
in this case refers recursively to the pattern in which it
appears. Obviously, PCRE cannot support the interpolation of
Perl code. Instead, the special item (?R) is provided for
the specific case of recursion. This PCRE pattern solves the
parentheses problem (assume the PCRE_EXTENDED option is set
so that white space is ignored):
\( ( (?>[^()]+) | (?R) )* \)
First it matches an opening parenthesis. Then it matches any
number of substrings which can either be a sequence of non-
parentheses, or a recursive match of the pattern itself
(i.e. a correctly parenthesized substring). Finally there is
a closing parenthesis.
This particular example pattern contains nested unlimited
repeats, and so the use of a once-only subpattern for match-
ing strings of non-parentheses is important when applying
the pattern to strings that do not match. For example, when
it is applied to
(aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa()
it yields "no match" quickly. However, if a once-only sub-
pattern is not used, the match runs for a very long time
indeed because there are so many different ways the + and *
repeats can carve up the subject, and all have to be tested
before failure can be reported.
The values set for any capturing subpatterns are those from
the outermost level of the recursion at which the subpattern
value is set. If the pattern above is matched against
(ab(cd)ef)
the value for the capturing parentheses is "ef", which is
the last value taken on at the top level. If additional
parentheses are added, giving
\( ( ( (?>[^()]+) | (?R) )* ) \)
   ^                        ^
   ^                        ^
the string they capture is
"ab(cd)ef", the contents of the top level parentheses. If
there are more than 15 capturing parentheses in a pattern,
PCRE has to obtain extra memory to store data during a
recursion, which it does by using pcre_malloc, freeing it
via pcre_free afterwards. If no memory can be obtained, it
saves data for the first 15 capturing parentheses only, as
there is no way to give an out-of-memory error from within a
recursion.
PERFORMANCE
Certain items that may appear in patterns are more efficient
than others. It is more efficient to use a character class
like [aeiou] than a set of alternatives such as (a|e|i|o|u).
In general, the simplest construction that provides the
required behaviour is usually the most efficient. Jeffrey
Friedl's book contains a lot of discussion about optimizing
regular expressions for efficient performance.
When a pattern begins with .* and the PCRE_DOTALL option is
set, the pattern is implicitly anchored by PCRE, since it
can match only at the start of a subject string. However, if
PCRE_DOTALL is not set, PCRE cannot make this optimization,
because the . metacharacter does not then match a newline,
and if the subject string contains newlines, the pattern may
match from the character immediately following one of them
instead of from the very start. For example, the pattern
(.*) second
matches the subject "first\nand second" (where \n stands for
a newline character) with the first captured substring being
"and". In order to do this, PCRE has to retry the match
starting after every newline in the subject.
If you are using such a pattern with subject strings that do
not contain newlines, the best performance is obtained by
setting PCRE_DOTALL, or starting the pattern with ^.* to
indicate explicit anchoring. That saves PCRE from having to
scan along the subject looking for a newline to restart at.
Beware of patterns that contain nested indefinite repeats.
These can take a long time to run when applied to a string
that does not match. Consider the pattern fragment
(a+)*
This can match "aaaa" in 33 different ways, and this number
increases very rapidly as the string gets longer. (The *
repeat can match 0, 1, 2, 3, or 4 times, and for each of
those cases other than 0, the + repeats can match different
numbers of times.) When the remainder of the pattern is such
that the entire match is going to fail, PCRE has in princi-
ple to try every possible variation, and this can take an
extremely long time.
An optimization catches some of the more simple cases such
as
(a+)*b
where a literal character follows. Before embarking on the
standard matching procedure, PCRE checks that there is a "b"
later in the subject string, and if there is not, it fails
the match immediately. However, when there is no following
literal this optimization cannot be used. You can see the
difference by comparing the behaviour of
(a+)*\d
with the pattern above. The pattern ending in "b" gives a
failure almost instantly when applied to a whole line of "a"
characters, whereas the one ending in \d takes an appreciable
time with strings longer than about 20 characters.
UTF-8 SUPPORT
Starting at release 3.3, PCRE has some support for character
strings encoded in the UTF-8 format. This is incomplete, and
is regarded as experimental. In order to use it, you must
configure PCRE to include UTF-8 support in the code, and, in
addition, you must call pcre_compile() with the PCRE_UTF8
option flag. When you do this, both the pattern and any sub-
ject strings that are matched against it are treated as
UTF-8 strings instead of just strings of bytes, but only in
the cases that are mentioned below.
If you compile PCRE with UTF-8 support, but do not use it at
run time, the library will be a bit bigger, but the addi-
tional run time overhead is limited to testing the PCRE_UTF8
flag in several places, so should not be very large.
PCRE assumes that the strings it is given contain valid
UTF-8 codes. It does not diagnose invalid UTF-8 strings. If
you pass invalid UTF-8 strings to PCRE, the results are
undefined.
Running with PCRE_UTF8 set causes these changes in the way
PCRE works:
1. In a pattern, the escape sequence \x{...}, where the con-
tents of the braces is a string of hexadecimal digits, is
interpreted as a UTF-8 character whose code number is the
given hexadecimal number, for example: \x{1234}. This
inserts from one to six literal bytes into the pattern,
using the UTF-8 encoding. If a non-hexadecimal digit appears
between the braces, the item is not recognized.
2. The original hexadecimal escape sequence, \xhh, generates
a two-byte UTF-8 character if its value is greater than 127.
3. Repeat quantifiers are NOT correctly handled if they fol-
low a multibyte character. For example, \x{100}* and \xc3+
do not work. If you want to repeat such characters, you must
enclose them in non-capturing parentheses, for example
(?:\x{100}), at present.
4. The dot metacharacter matches one UTF-8 character instead
of a single byte.
5. Unlike literal UTF-8 characters, the dot metacharacter
followed by a repeat quantifier does operate correctly on
UTF-8 characters instead of single bytes.
6. Although the \x{...} escape is permitted in a character
class, characters whose values are greater than 255 cannot
be included in a class.
7. A class is matched against a UTF-8 character instead of
just a single byte, but it can match only characters whose
values are less than 256. Characters with greater values
always fail to match a class.
8. Repeated classes work correctly on multiple characters.
9. Classes containing just a single character whose value is
greater than 127 (but less than 256), for example, [\x80] or
[^\x{93}], do not work because these are optimized into sin-
gle byte matches. In the first case, of course, the class
brackets are just redundant.
10. Lookbehind assertions move backwards in the subject by a
fixed number of characters instead of a fixed number of
bytes. Simple cases have been tested to work correctly, but
there may be hidden gotchas herein.
11. The character types such as \d and \w do not work
correctly with UTF-8 characters. They continue to test a
single byte.
12. Anything not explicitly mentioned here continues to work
in bytes rather than in characters.
The following UTF-8 features of Perl 5.6 are not imple-
mented:
1. The escape sequence \C to match a single byte.
2. The use of Unicode tables and properties and escapes \p,
\P, and \X.
AUTHOR
Philip Hazel
University Computing Service,
New Museums Site,
Cambridge CB2 3QG, England.
Phone: +44 1223 334714
Last updated: 28 August 2000,
the 250th anniversary of the death of J.S. Bach.
Copyright (c) 1997-2000 University of Cambridge.
privoxy-3.0.21-stable/./pcre/doc/NON-UNIX-USE 000640 001751 001751 00000004310 10546014077 017252 0 ustar 00fk fk 000000 000000 Compiling PCRE on non-Unix systems
----------------------------------
If you want to compile PCRE for a non-Unix system, note that it consists
entirely of code written in Standard C, and so should compile successfully
on any machine with a Standard C compiler and library, using normal compiling
commands to do the following:
(1) Copy or rename the file config.in as config.h, and change the macros that
define HAVE_STRERROR and HAVE_MEMMOVE to define them as 1 rather than 0.
Unfortunately, because of the way Unix autoconf works, the default setting has
to be 0.
(2) Copy or rename the file pcre.in as pcre.h, and change the macro definitions
for PCRE_MAJOR, PCRE_MINOR, and PCRE_DATE near its start to the values set in
configure.in.
(3) Compile dftables.c as a stand-alone program, and then run it with
the standard output sent to chartables.c. This generates a set of standard
character tables.
(4) Compile maketables.c, get.c, study.c and pcre.c and link them all
together into an object library in whichever form your system keeps such
libraries. This is the pcre library (chartables.c gets included by means of an
#include directive).
(5) Similarly, compile pcreposix.c and link it as the pcreposix library.
(6) Compile the test program pcretest.c. This needs the functions in the
pcre and pcreposix libraries when linking.
(7) Run pcretest on the testinput files in the testdata directory, and check
that the output matches the corresponding testoutput files. You must use the
-i option when checking testinput2.
If you have a system without "configure" but where you can use a Makefile, edit
Makefile.in to create Makefile, substituting suitable values for the variables
at the head of the file.
Some help in building a Win32 DLL of PCRE in GnuWin32 environments was
contributed by Paul.Sokolovsky@technologist.com. These environments are
Mingw32 (http://www.xraylith.wisc.edu/~khan/software/gnu-win32/) and
CygWin (http://sourceware.cygnus.com/cygwin/). Paul comments:
For CygWin, set CFLAGS=-mno-cygwin, and do 'make dll'. You'll get
pcre.dll (containing pcreposix also), libpcre.dll.a, and dynamically
linked pgrep and pcretest. If you have /bin/sh, run RunTest (three
main test go ok, locale not supported).
****
privoxy-3.0.21-stable/./pcre/doc/ChangeLog 000640 001751 001751 00000060346 10546014077 017247 0 ustar 00fk fk 000000 000000 ChangeLog for PCRE
------------------
Version 3.4 22-Aug-00
---------------------
1. Fixed typo in pcre.h: unsigned const char * changed to const unsigned char *.
2. Diagnose condition (?(0) as an error instead of crashing on matching.
Version 3.3 01-Aug-00
---------------------
1. If an octal character was given, but the value was greater than \377, it
was not getting masked to the least significant bits, as documented. This could
lead to crashes in some systems.
2. Perl 5.6 (if not earlier versions) accepts classes like [a-\d] and treats
the hyphen as a literal. PCRE used to give an error; it now behaves like Perl.
3. Added the functions pcre_free_substring() and pcre_free_substring_list().
These just pass their arguments on to (pcre_free)(), but they are provided
because some uses of PCRE bind it to non-C systems that can call its functions,
but cannot call free() or pcre_free() directly.
4. Add "make test" as a synonym for "make check". Corrected some comments in
the Makefile.
5. Add $(DESTDIR)/ in front of all the paths in the "install" target in the
Makefile.
6. Changed the name of pgrep to pcregrep, because Solaris has introduced a
command called pgrep for grepping around the active processes.
7. Added the beginnings of support for UTF-8 character strings.
8. Arranged for the Makefile to pass over the settings of CC, CFLAGS, and
RANLIB to ./ltconfig so that they are used by libtool. I think these are all
the relevant ones. (AR is not passed because ./ltconfig does its own figuring
out for the ar command.)
Version 3.2 12-May-00
---------------------
This is purely a bug fixing release.
1. If the pattern /((Z)+|A)*/ was matched against ZABCDEFG it matched Z instead
of ZA. This was just one example of several cases that could provoke this bug,
which was introduced by change 9 of version 2.00. The code for breaking
infinite loops after an iteration that matches an empty string wasn't working
correctly.
2. The pcretest program was not imitating Perl correctly for the pattern /a*/g
when matched against abbab (for example). After matching an empty string, it
wasn't forcing anchoring when setting PCRE_NOTEMPTY for the next attempt; this
caused it to match further down the string than it should.
3. The code contained an inclusion of sys/types.h. It isn't clear why this
was there because it doesn't seem to be needed, and it causes trouble on some
systems, as it is not a Standard C header. It has been removed.
4. Made 4 silly changes to the source to avoid stupid compiler warnings that
were reported on the Macintosh. The changes were from
while ((c = *(++ptr)) != 0 && c != '\n');
to
while ((c = *(++ptr)) != 0 && c != '\n') ;
Totally extraordinary, but if that's what it takes...
5. PCRE is being used in one environment where neither memmove() nor bcopy() is
available. Added HAVE_BCOPY and an autoconf test for it; if neither
HAVE_MEMMOVE nor HAVE_BCOPY is set, use a built-in emulation function which
assumes the way PCRE uses memmove() (always moving upwards).
6. PCRE is being used in one environment where strchr() is not available. There
was only one use in pcre.c, and writing it out to avoid strchr() probably gives
faster code anyway.
Version 3.1 09-Feb-00
---------------------
The only change in this release is the fixing of some bugs in Makefile.in for
the "install" target:
(1) It was failing to install pcreposix.h.
(2) It was overwriting the pcre.3 man page with the pcreposix.3 man page.
Version 3.0 01-Feb-00
---------------------
1. Add support for the /+ modifier to perltest (to output $` like it does in
pcretest).
2. Add support for the /g modifier to perltest.
3. Fix pcretest so that it behaves even more like Perl for /g when the pattern
matches null strings.
4. Fix perltest so that it doesn't do unwanted things when fed an empty
pattern. Perl treats empty patterns specially - it reuses the most recent
pattern, which is not what we want. Replace // by /(?#)/ in order to avoid this
effect.
5. The POSIX interface was broken in that it was just handing over the POSIX
captured string vector to pcre_exec(), but (since release 2.00) PCRE has
required a bigger vector, with some working space on the end. This means that
the POSIX wrapper now has to get and free some memory, and copy the results.
6. Added some simple autoconf support, placing the test data and the
documentation in separate directories, re-organizing some of the
information files, and making it build pcre-config (a GNU standard). Also added
libtool support for building PCRE as a shared library, which is now the
default.
7. Got rid of the leading zero in the definition of PCRE_MINOR because 08 and
09 are not valid octal constants. Single digits will be used for minor values
less than 10.
8. Defined REG_EXTENDED and REG_NOSUB as zero in the POSIX header, so that
existing programs that set these in the POSIX interface can use PCRE without
modification.
9. Added a new function, pcre_fullinfo() with an extensible interface. It can
return all that pcre_info() returns, plus additional data. The pcre_info()
function is retained for compatibility, but is considered to be obsolete.
10. Added experimental recursion feature (?R) to handle one common case that
Perl 5.6 will be able to do with (?p{...}).
11. Added support for POSIX character classes like [:alpha:], which Perl is
adopting.
Version 2.08 31-Aug-99
----------------------
1. When startoffset was not zero and the pattern began with ".*", PCRE was not
trying to match at the startoffset position, but instead was moving forward to
the next newline as if a previous match had failed.
2. pcretest was not making use of PCRE_NOTEMPTY when repeating for /g and /G,
and could get into a loop if a null string was matched other than at the start
of the subject.
3. Added definitions of PCRE_MAJOR and PCRE_MINOR to pcre.h so the version can
be distinguished at compile time, and for completeness also added PCRE_DATE.
5. Added Paul Sokolovsky's minor changes to make it easy to compile a Win32 DLL
in GnuWin32 environments.
Version 2.07 29-Jul-99
----------------------
1. The documentation is now supplied in plain text form and HTML as well as in
the form of man page sources.
2. C++ compilers don't like assigning (void *) values to other pointer types.
In particular this affects malloc(). Although there is no problem in Standard
C, I've put in casts to keep C++ compilers happy.
3. Typo in pcretest.c; a cast of (unsigned char *) in the POSIX regexec() call
should be (const char *).
4. If NOPOSIX is defined, pcretest.c compiles without POSIX support. This may
be useful for non-Unix systems who don't want to bother with the POSIX stuff.
However, I haven't made this a standard facility. The documentation doesn't
mention it, and the Makefile doesn't support it.
5. The Makefile now contains an "install" target, with editable destinations at
the top of the file. The pcretest program is not installed.
6. pgrep -V now gives the PCRE version number and date.
7. Fixed bug: a zero repetition after a literal string (e.g. /abcde{0}/) was
causing the entire string to be ignored, instead of just the last character.
8. If a pattern like /"([^\\"]+|\\.)*"/ is applied in the normal way to a
non-matching string, it can take a very, very long time, even for strings of
quite modest length, because of the nested recursion. PCRE now does better in
some of these cases. It does this by remembering the last required literal
character in the pattern, and pre-searching the subject to ensure it is present
before running the real match. In other words, it applies a heuristic to detect
some types of certain failure quickly, and in the above example, if presented
with a string that has no trailing " it gives "no match" very quickly.
9. A new runtime option PCRE_NOTEMPTY causes null string matches to be ignored;
other alternatives are tried instead.
Version 2.06 09-Jun-99
----------------------
1. Change pcretest's output for amount of store used to show just the code
space, because the remainder (the data block) varies in size between 32-bit and
64-bit systems.
2. Added an extra argument to pcre_exec() to supply an offset in the subject to
start matching at. This allows lookbehinds to work when searching for multiple
occurrences in a string.
3. Added additional options to pcretest for testing multiple occurrences:
/+ outputs the rest of the string that follows a match
/g loops for multiple occurrences, using the new startoffset argument
/G loops for multiple occurrences by passing an incremented pointer
4. PCRE wasn't doing the "first character" optimization for patterns starting
with \b or \B, though it was doing it for other lookbehind assertions. That is,
it wasn't noticing that a match for a pattern such as /\bxyz/ has to start with
the letter 'x'. On long subject strings, this gives a significant speed-up.
Version 2.05 21-Apr-99
----------------------
1. Changed the type of magic_number from int to long int so that it works
properly on 16-bit systems.
2. Fixed a bug which caused patterns starting with .* not to work correctly
when the subject string contained newline characters. PCRE was assuming
anchoring for such patterns in all cases, which is not correct because .* will
not pass a newline unless PCRE_DOTALL is set. It now assumes anchoring only if
DOTALL is set at top level; otherwise it knows that patterns starting with .*
must be retried after every newline in the subject.
Version 2.04 18-Feb-99
----------------------
1. For parenthesized subpatterns with repeats whose minimum was zero, the
computation of the store needed to hold the pattern was incorrect (too large).
If such patterns were nested a few deep, this could multiply and become a real
problem.
2. Added /M option to pcretest to show the memory requirement of a specific
pattern. Made -m a synonym of -s (which does this globally) for compatibility.
3. Subpatterns of the form (regex){n,m} (i.e. limited maximum) were being
compiled in such a way that the backtracking after subsequent failure was
pessimal. Something like (a){0,3} was compiled as (a)?(a)?(a)? instead of
((a)((a)(a)?)?)? with disastrous performance if the maximum was of any size.
Version 2.03 02-Feb-99
----------------------
1. Fixed typo and small mistake in man page.
2. Added 4th condition (GPL supersedes if conflict) and created separate
LICENCE file containing the conditions.
3. Updated pcretest so that patterns such as /abc\/def/ work like they do in
Perl, that is the internal \ allows the delimiter to be included in the
pattern. Locked out the use of \ as a delimiter. If \ immediately follows
the final delimiter, add \ to the end of the pattern (to test the error).
4. Added the convenience functions for extracting substrings after a successful
match. Updated pcretest to make it able to test these functions.
Version 2.02 14-Jan-99
----------------------
1. Initialized the working variables associated with each extraction so that
their saving and restoring doesn't refer to uninitialized store.
2. Put dummy code into study.c in order to trick the optimizer of the IBM C
compiler for OS/2 into generating correct code. Apparently IBM isn't going to
fix the problem.
3. Pcretest: the timing code wasn't using LOOPREPEAT for timing execution
calls, and wasn't printing the correct value for compiling calls. Increased the
default value of LOOPREPEAT, and the number of significant figures in the
times.
4. Changed "/bin/rm" in the Makefile to "-rm" so it works on Windows NT.
5. Renamed "deftables" as "dftables" to get it down to 8 characters, to avoid
a building problem on Windows NT with a FAT file system.
Version 2.01 21-Oct-98
----------------------
1. Changed the API for pcre_compile() to allow for the provision of a pointer
to character tables built by pcre_maketables() in the current locale. If NULL
is passed, the default tables are used.
Version 2.00 24-Sep-98
----------------------
1. Since the (?>) facility is in Perl 5.005, don't require PCRE_EXTRA to enable
it any more.
2. Allow quantification of (?>) groups, and make it work correctly.
3. The first character computation wasn't working for (?>) groups.
4. Correct the implementation of \Z (it is permitted to match on the \n at the
end of the subject) and add 5.005's \z, which really does match only at the
very end of the subject.
5. Remove the \X "cut" facility; Perl doesn't have it, and (?> is neater.
6. Remove the ability to specify CASELESS, MULTILINE, DOTALL, and
DOLLAR_END_ONLY at runtime, to make it possible to implement the Perl 5.005
localized options. All options to pcre_study() were also removed.
7. Add other new features from 5.005:
$(?<= positive lookbehind
$(?a*))*/ (a PCRE_EXTRA facility).
Version 1.00 18-Nov-97
----------------------
1. Added compile-time macros to support systems such as SunOS4 which don't have
memmove() or strerror() but have other things that can be used instead.
2. Arranged that "make clean" removes the executables.
Version 0.99 27-Oct-97
----------------------
1. Fixed bug in code for optimizing classes with only one character. It was
initializing a 32-byte map regardless, which could cause it to run off the end
of the memory it had got.
2. Added, conditional on PCRE_EXTRA, the proposed (?>REGEX) construction.
Version 0.98 22-Oct-97
----------------------
1. Fixed bug in code for handling temporary memory usage when there are more
back references than supplied space in the ovector. This could cause segfaults.
Version 0.97 21-Oct-97
----------------------
1. Added the \X "cut" facility, conditional on PCRE_EXTRA.
2. Optimized negated single characters not to use a bit map.
3. Brought error texts together as macro definitions; clarified some of them;
fixed one that was wrong - it said "range out of order" when it meant "invalid
escape sequence".
4. Changed some char * arguments to const char *.
5. Added PCRE_NOTBOL and PCRE_NOTEOL (from POSIX).
6. Added the POSIX-style API wrapper in pcreposix.a and testing facilities in
pcretest.
Version 0.96 16-Oct-97
----------------------
1. Added a simple "pgrep" utility to the distribution.
2. Fixed an incompatibility with Perl: "{" is now treated as a normal character
unless it appears in one of the precise forms "{ddd}", "{ddd,}", or "{ddd,ddd}"
where "ddd" means "one or more decimal digits".
3. Fixed serious bug. If a pattern had a back reference, but the call to
pcre_exec() didn't supply a large enough ovector to record the related
identifying subpattern, the match always failed. PCRE now remembers the number
of the largest back reference, and gets some temporary memory in which to save
the offsets during matching if necessary, in order to ensure that
backreferences always work.
4. Increased the compatibility with Perl in a number of ways:
(a) . no longer matches \n by default; an option PCRE_DOTALL is provided
to request this handling. The option can be set at compile or exec time.
(b) $ matches before a terminating newline by default; an option
PCRE_DOLLAR_ENDONLY is provided to override this (but not in multiline
mode). The option can be set at compile or exec time.
(c) The handling of \ followed by a digit other than 0 is now supposed to be
the same as Perl's. If the decimal number it represents is less than 10
or there aren't that many previous left capturing parentheses, an octal
escape is read. Inside a character class, it's always an octal escape,
even if it is a single digit.
(d) An escaped but undefined alphabetic character is taken as a literal,
unless PCRE_EXTRA is set. Currently this just reserves the remaining
escapes.
(e) {0} is now permitted. (The previous item is removed from the compiled
pattern).
5. Changed all the names of code files so that the basic parts are no longer
than 10 characters, and abolished the teeny "globals.c" file.
6. Changed the handling of character classes; they are now done with a 32-byte
bit map always.
7. Added the -d and /D options to pcretest to make it possible to look at the
internals of compilation without having to recompile pcre.
Version 0.95 23-Sep-97
----------------------
1. Fixed bug in pre-pass concerning escaped "normal" characters such as \x5c or
\x20 at the start of a run of normal characters. These were being treated as
real characters, instead of the source characters being re-checked.
Version 0.94 18-Sep-97
----------------------
1. The functions are now thread-safe, with the caveat that the global variables
containing pointers to malloc() and free() or alternative functions are the
same for all threads.
2. Get pcre_study() to generate a bitmap of initial characters for non-
anchored patterns when this is possible, and use it if passed to pcre_exec().
Version 0.93 15-Sep-97
----------------------
1. /(b)|(:+)/ was computing an incorrect first character.
2. Add pcre_study() to the API and the passing of pcre_extra to pcre_exec(),
but not actually doing anything yet.
3. Treat "-" characters in classes that cannot be part of ranges as literals,
as Perl does (e.g. [-az] or [az-]).
4. Set the anchored flag if a branch starts with .* or .*? because that tests
all possible positions.
5. Split up into different modules to avoid including unneeded functions in a
compiled binary. However, compile and exec are still in one module. The "study"
function is split off.
6. The character tables are now in a separate module whose source is generated
by an auxiliary program - but can then be edited by hand if required. There are
now no calls to isalnum(), isspace(), isdigit(), isxdigit(), tolower() or
toupper() in the code.
7. Turn the malloc/free function variables into pcre_malloc and pcre_free and
make them global. Abolish the function for setting them, as the caller can now
set them directly.
Version 0.92 11-Sep-97
----------------------
1. A repeat with a fixed maximum and a minimum of 1 for an ordinary character
(e.g. /a{1,3}/) was broken (I mis-optimized it).
2. Caseless matching was not working in character classes if the characters in
the pattern were in upper case.
3. Make ranges like [W-c] work in the same way as Perl for caseless matching.
4. Make PCRE_ANCHORED public and accept as a compile option.
5. Add an options word to pcre_exec() and accept PCRE_ANCHORED and
PCRE_CASELESS at run time. Add escapes \A and \I to pcretest to cause it to
pass them.
6. Give an error if bad option bits passed at compile or run time.
7. Add PCRE_MULTILINE at compile and exec time, and (?m) as well. Add \M to
pcretest to cause it to pass that flag.
8. Add pcre_info(), to get the number of identifying subpatterns, the stored
options, and the first character, if set.
9. Recognize C+ or C{n,m} where n >= 1 as providing a fixed starting character.
Version 0.91 10-Sep-97
----------------------
1. PCRE was failing to diagnose unlimited repeats of subpatterns that could
match the empty string as in /(a*)*/. It was looping and ultimately crashing.
2. PCRE was looping on encountering an indefinitely repeated back reference to
a subpattern that had matched an empty string, e.g. /(a|)\1*/. It now does what
Perl does - treats the match as successful.
****
privoxy-3.0.21-stable/./pcre/pcre.in 000640 001751 001751 00000006117 10546014100 016170 0 ustar 00fk fk 000000 000000 /*************************************************
* Perl-Compatible Regular Expressions *
*************************************************/
/* Copyright (c) 1997-2000 University of Cambridge */
#ifndef _PCRE_H
#define _PCRE_H

/* The file pcre.h is built by "configure". Do not edit it; instead
make changes to pcre.in. */

/* Version identification. The @...@ markers are placeholders; presumably
"configure" substitutes the real values when it builds pcre.h. */

#define PCRE_MAJOR @PCRE_MAJOR@
#define PCRE_MINOR @PCRE_MINOR@
#define PCRE_DATE @PCRE_DATE@

/* Win32 uses DLL by default. Defining STATIC_PCRE before including this
header suppresses the dllimport decoration. */

#ifdef _WIN32
#  ifdef STATIC_PCRE
#    define PCRE_DL_IMPORT
#  else
#    define PCRE_DL_IMPORT __declspec(dllimport)
#  endif
#else
#  define PCRE_DL_IMPORT
#endif

/* Have to include stdlib.h in order to ensure that size_t is defined;
it is needed here for malloc. */

#include <stdlib.h>

/* Allow for C++ users */

#ifdef __cplusplus
extern "C" {
#endif

/* Options. Each option is a distinct bit so that several can be ORed
together into a single options word. */

#define PCRE_CASELESS 0x0001
#define PCRE_MULTILINE 0x0002
#define PCRE_DOTALL 0x0004
#define PCRE_EXTENDED 0x0008
#define PCRE_ANCHORED 0x0010
#define PCRE_DOLLAR_ENDONLY 0x0020
#define PCRE_EXTRA 0x0040
#define PCRE_NOTBOL 0x0080
#define PCRE_NOTEOL 0x0100
#define PCRE_UNGREEDY 0x0200
#define PCRE_NOTEMPTY 0x0400
#define PCRE_UTF8 0x0800

/* Exec-time and get-time error codes. All of them are negative. */

#define PCRE_ERROR_NOMATCH (-1)
#define PCRE_ERROR_NULL (-2)
#define PCRE_ERROR_BADOPTION (-3)
#define PCRE_ERROR_BADMAGIC (-4)
#define PCRE_ERROR_UNKNOWN_NODE (-5)
#define PCRE_ERROR_NOMEMORY (-6)
#define PCRE_ERROR_NOSUBSTRING (-7)

/* Request types for pcre_fullinfo() */

#define PCRE_INFO_OPTIONS 0
#define PCRE_INFO_SIZE 1
#define PCRE_INFO_CAPTURECOUNT 2
#define PCRE_INFO_BACKREFMAX 3
#define PCRE_INFO_FIRSTCHAR 4
#define PCRE_INFO_FIRSTTABLE 5
#define PCRE_INFO_LASTLITERAL 6

/* Types. Both are opaque to callers of the library. */

typedef void pcre;
typedef void pcre_extra;

/* Store get and free functions. These can be set to alternative malloc/free
functions if required. Some magic is required for Win32 DLL; it is null on
other OS. */

PCRE_DL_IMPORT extern void *(*pcre_malloc)(size_t);
PCRE_DL_IMPORT extern void (*pcre_free)(void *);

/* The decoration macro is only needed for the two declarations above, so it
is removed again to keep it out of the includer's namespace. */

#undef PCRE_DL_IMPORT

/* Functions */

extern pcre *pcre_compile(const char *, int, const char **, int *,
  const unsigned char *);
extern int pcre_copy_substring(const char *, int *, int, int, char *, int);
extern int pcre_exec(const pcre *, const pcre_extra *, const char *,
  int, int, int, int *, int);
extern void pcre_free_substring(const char *);
extern void pcre_free_substring_list(const char **);
extern int pcre_get_substring(const char *, int *, int, int, const char **);
extern int pcre_get_substring_list(const char *, int *, int, const char ***);
extern int pcre_info(const pcre *, int *, int *);
extern int pcre_fullinfo(const pcre *, const pcre_extra *, int, void *);
extern unsigned const char *pcre_maketables(void);
extern pcre_extra *pcre_study(const pcre *, int, const char **);
extern const char *pcre_version(void);

#ifdef __cplusplus
} /* extern "C" */
#endif

#endif /* End of pcre.h */
privoxy-3.0.21-stable/./pcre/study.c 000640 001751 001751 00000025172 10546014100 016225 0 ustar 00fk fk 000000 000000 /*************************************************
* Perl-Compatible Regular Expressions *
*************************************************/
/*
This is a library of functions to support regular expressions whose syntax
and semantics are as close as possible to those of the Perl 5 language. See
the file Tech.Notes for some information on the internals.
Written by: Philip Hazel
Copyright (c) 1997-2000 University of Cambridge
-----------------------------------------------------------------------------
Permission is granted to anyone to use this software for any purpose on any
computer system, and to redistribute it freely, subject to the following
restrictions:
1. This software is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
2. The origin of this software must not be misrepresented, either by
explicit claim or by omission.
3. Altered versions must be plainly marked as such, and must not be
misrepresented as being the original software.
4. If PCRE is embedded in any software that is released under the GNU
General Purpose Licence (GPL), then the terms of that licence shall
supersede any condition above with which it is incompatible.
-----------------------------------------------------------------------------
*/
/* Include the internals header, which itself includes Standard C headers plus
the external pcre header. */
#include "internal.h"
/*************************************************
*      Set a bit and maybe its alternate case    *
*************************************************/

/* Record a character as a possible starting character by turning on its bit
in the map. When matching is caseless and the ctypes table marks the character
as a letter, the bit of the counterpart obtained from the fcc table is turned
on as well.

Arguments:
  start_bits    the bit map to update
  c             the character whose bit is to be set
  caseless      TRUE when caseless matching is in force
  cd            the block holding the character table pointers

Returns:        nothing
*/

static void
set_bit(uschar *start_bits, int c, BOOL caseless, compile_data *cd)
{
int other;

start_bits[c/8] |= (1 << (c&7));

/* Nothing more to do unless we are caseless and c is a letter. */

if (!caseless || (cd->ctypes[c] & ctype_letter) == 0) return;

other = cd->fcc[c];
start_bits[other/8] |= (1 << (other&7));
}
/*************************************************
*      Add a character type's bits to the map    *
*************************************************/

/* Private helper for set_start_bits(). ORs into the bit map the class bits
that belong to one single-character type opcode. For OP_DIGIT, OP_WHITESPACE
and OP_WORDCHAR the class map from the cbits table is used as it stands; for
the corresponding OP_NOT_ opcodes its complement is used.

Arguments:
  start_bits    points to the 32-byte bit map
  type          the character type opcode
  cd            the block with char table pointers

Returns:        TRUE if the opcode was one of the six handled types,
                FALSE (map left untouched) otherwise
*/

static BOOL
set_type_bits(uschar *start_bits, int type, compile_data *cd)
{
const uschar *class_map;
BOOL negated = FALSE;
int i;

switch(type)
  {
  case OP_NOT_DIGIT:
  negated = TRUE;                      /* Fall through */
  case OP_DIGIT:
  class_map = cd->cbits + cbit_digit;
  break;

  case OP_NOT_WHITESPACE:
  negated = TRUE;                      /* Fall through */
  case OP_WHITESPACE:
  class_map = cd->cbits + cbit_space;
  break;

  case OP_NOT_WORDCHAR:
  negated = TRUE;                      /* Fall through */
  case OP_WORDCHAR:
  class_map = cd->cbits + cbit_word;
  break;

  default:
  return FALSE;
  }

for (i = 0; i < 32; i++)
  start_bits[i] |= negated? ~class_map[i] : class_map[i];

return TRUE;
}



/*************************************************
*      Create bitmap of starting chars           *
*************************************************/

/* This function scans a compiled unanchored expression and attempts to build a
bitmap of the set of initial characters. If it can't, it returns FALSE. As time
goes by, we may be able to get more clever at doing this.

Each alternative of the expression is scanned in turn. Within an alternative,
an item that is able to match nothing (a zero-minimum repeat, an optional
bracket, a skipped assertion, an option setting) contributes its bits, if any,
and the scan moves on to the following item; the first item that has to match
a character contributes its bits and ends the scan of that alternative. Any
item the scan does not understand makes the whole function fail, because a
partial map would wrongly rule out valid starting characters.

Arguments:
  code         points to an expression
  start_bits   points to a 32-byte table, initialized to 0
  caseless     the current state of the caseless flag
  cd           the block with char table pointers

Returns:       TRUE if table built, FALSE otherwise
*/

static BOOL
set_start_bits(const uschar *code, uschar *start_bits, BOOL caseless,
  compile_data *cd)
{
register int c;

/* This next statement and the later reference to dummy are here in order to
trick the optimizer of the IBM C compiler for OS/2 into generating correct
code. Apparently IBM isn't going to fix the problem, and we would rather not
disable optimization (in this module it actually makes a big difference, and
the pcre module can use all the optimization it can get). */

volatile int dummy;

do
  {
  const uschar *tcode = code + 3;
  BOOL try_next = TRUE;

  while (try_next)
    {
    try_next = FALSE;

    /* If a branch starts with a bracket or a positive lookahead assertion,
    recurse to set bits from within them. That's all for this branch. */

    if ((int)*tcode >= OP_BRA || *tcode == OP_ASSERT)
      {
      if (!set_start_bits(tcode, start_bits, caseless, cd))
        return FALSE;
      }

    else switch(*tcode)
      {
      default:
      return FALSE;

      /* Skip over lookbehind and negative lookahead assertions: step along
      the chain of alternatives to the end of the group, then past it. */

      case OP_ASSERT_NOT:
      case OP_ASSERTBACK:
      case OP_ASSERTBACK_NOT:
      try_next = TRUE;
      do tcode += (tcode[1] << 8) + tcode[2]; while (*tcode == OP_ALT);
      tcode += 3;
      break;

      /* Skip over an option setting, changing the caseless flag */

      case OP_OPT:
      caseless = (tcode[1] & PCRE_CASELESS) != 0;
      tcode += 2;
      try_next = TRUE;
      break;

      /* BRAZERO does the bracket, but carries on. */

      case OP_BRAZERO:
      case OP_BRAMINZERO:
      if (!set_start_bits(++tcode, start_bits, caseless, cd))
        return FALSE;
      dummy = 1;
      do tcode += (tcode[1] << 8) + tcode[2]; while (*tcode == OP_ALT);
      tcode += 3;
      try_next = TRUE;
      break;

      /* Single-char * or ? sets the bit and tries the next item */

      case OP_STAR:
      case OP_MINSTAR:
      case OP_QUERY:
      case OP_MINQUERY:
      set_bit(start_bits, tcode[1], caseless, cd);
      tcode += 2;
      try_next = TRUE;
      break;

      /* Single-char upto sets the bit and tries the next */

      case OP_UPTO:
      case OP_MINUPTO:
      set_bit(start_bits, tcode[3], caseless, cd);
      tcode += 4;
      try_next = TRUE;
      break;

      /* At least one single char sets the bit and stops. The increments
      line tcode up so that the character is at tcode[1] in every case. */

      case OP_EXACT:       /* Fall through */
      tcode++;

      case OP_CHARS:       /* Fall through */
      tcode++;

      case OP_PLUS:
      case OP_MINPLUS:
      set_bit(start_bits, tcode[1], caseless, cd);
      break;

      /* Single character type sets the bits and stops. The helper cannot
      fail here because only the six types it knows are listed. */

      case OP_NOT_DIGIT:
      case OP_DIGIT:
      case OP_NOT_WHITESPACE:
      case OP_WHITESPACE:
      case OP_NOT_WORDCHAR:
      case OP_WORDCHAR:
      (void)set_type_bits(start_bits, *tcode, cd);
      break;

      /* One or more character type fudges the pointer and restarts, knowing
      it will hit a single character type and stop there. */

      case OP_TYPEPLUS:
      case OP_TYPEMINPLUS:
      tcode++;
      try_next = TRUE;
      break;

      case OP_TYPEEXACT:
      tcode += 3;
      try_next = TRUE;
      break;

      /* Zero or more repeats of character types set the bits and then
      try again. */

      case OP_TYPEUPTO:
      case OP_TYPEMINUPTO:
      tcode += 2;               /* Fall through */

      case OP_TYPESTAR:
      case OP_TYPEMINSTAR:
      case OP_TYPEQUERY:
      case OP_TYPEMINQUERY:

      /* A repeated type other than the six class-based ones (for instance a
      match-anything type, if the opcode set has one) adds no bits. Carrying
      on regardless would produce a map that leaves out characters the
      repeat itself can start with, so give up instead. */

      if (!set_type_bits(start_bits, tcode[1], cd))
        return FALSE;

      tcode += 2;
      try_next = TRUE;
      break;

      /* Character class: set the bits and either carry on or not,
      according to the repeat count. */

      case OP_CLASS:
        {
        tcode++;
        for (c = 0; c < 32; c++) start_bits[c] |= tcode[c];
        tcode += 32;
        switch (*tcode)
          {
          case OP_CRSTAR:
          case OP_CRMINSTAR:
          case OP_CRQUERY:
          case OP_CRMINQUERY:
          tcode++;
          try_next = TRUE;
          break;

          /* A range repeat lets the scan carry on only when the value held
          in the two bytes after the opcode is zero. */

          case OP_CRRANGE:
          case OP_CRMINRANGE:
          if (((tcode[1] << 8) + tcode[2]) == 0)
            {
            tcode += 5;
            try_next = TRUE;
            }
          break;

          /* Anything else: the class must match once, so stop here. */

          default:
          break;
          }
        }
      break; /* End of class handling */

      }      /* End of switch */
    }        /* End of try_next loop */

  code += (code[1] << 8) + code[2];   /* Advance to next branch */
  }
while (*code == OP_ALT);

return TRUE;
}
/*************************************************
* Study a compiled expression *
*************************************************/
/* This function is handed a compiled expression that it must study to produce
information that will speed up the matching. It returns a pcre_extra block
which then gets handed back to pcre_exec().
Arguments:
re points to the compiled expression
options contains option bits
errorptr points to where to place error messages;
set NULL unless error
Returns: pointer to a pcre_extra block,
NULL on error or if no optimization possible
*/
pcre_extra *
pcre_study(const pcre *external_re, int options, const char **errorptr)
{
uschar start_bits[32];
real_pcre_extra *extra;
const real_pcre *re = (const real_pcre *)external_re;
compile_data compile_block;
*errorptr = NULL;
if (re == NULL || re->magic_number != MAGIC_NUMBER)
{
*errorptr = "argument is not a compiled regular expression";
return NULL;
}
if ((options & ~PUBLIC_STUDY_OPTIONS) != 0)
{
*errorptr = "unknown or incorrect option bit(s) set";
return NULL;
}
/* For an anchored pattern, or an unchored pattern that has a first char, or a
multiline pattern that matches only at "line starts", no further processing at
present. */
if ((re->options & (PCRE_ANCHORED|PCRE_FIRSTSET|PCRE_STARTLINE)) != 0)
return NULL;
/* Set the character tables in the block which is passed around */
compile_block.lcc = re->tables + lcc_offset;
compile_block.fcc = re->tables + fcc_offset;
compile_block.cbits = re->tables + cbits_offset;
compile_block.ctypes = re->tables + ctypes_offset;
/* See if we can find a fixed set of initial characters for the pattern. */
memset(start_bits, 0, 32 * sizeof(uschar));
if (!set_start_bits(re->code, start_bits, (re->options & PCRE_CASELESS) != 0,
&compile_block)) return NULL;
/* Get an "extra" block and put the information therein. */
extra = (real_pcre_extra *)(pcre_malloc)(sizeof(real_pcre_extra));
if (extra == NULL)
{
*errorptr = "failed to get memory";
return NULL;
}
extra->options = PCRE_STUDY_MAPPED;
memcpy(extra->start_bits, start_bits, sizeof(start_bits));
return (pcre_extra *)extra;
}
/* End of study.c */
privoxy-3.0.21-stable/./pcre/pcre-config.in 000640 001751 001751 00000002105 10546014100 017424 0 ustar 00fk fk 000000 000000 #!/bin/sh
# Installation prefixes. The @...@ markers are placeholders in this template.
prefix=@prefix@
exec_prefix=@exec_prefix@

# Becomes "yes" once --exec-prefix=DIR has been seen, so that a later
# --prefix=DIR no longer overrides exec_prefix.
exec_prefix_set=no

usage="\
Usage: pcre-config [--prefix] [--exec-prefix] [--version] [--libs] [--libs-posix] [--cflags] [--cflags-posix]"

# At least one option is required.
if [ $# -eq 0 ]; then
  echo "${usage}" 1>&2
  exit 1
fi

# Handle the options one by one, in the order given.
for arg
do
  # For an option of the form -name=value, keep just the value.
  case "$arg" in
    -*=*)
      optarg=`echo "$arg" | sed 's/[-_a-zA-Z0-9]*=//'`
      ;;
    *)
      optarg=
      ;;
  esac

  case $arg in
    --prefix=*)
      prefix=$optarg
      if [ $exec_prefix_set = no ]; then
        exec_prefix=$optarg
      fi
      ;;

    --prefix)
      echo $prefix
      ;;

    --exec-prefix=*)
      exec_prefix=$optarg
      exec_prefix_set=yes
      ;;

    --exec-prefix)
      echo $exec_prefix
      ;;

    --version)
      echo @PCRE_VERSION@
      ;;

    # An include flag is only printed for a non-default include directory.
    --cflags | --cflags-posix)
      if [ @includedir@ != /usr/include ]; then
        includes=-I@includedir@
      fi
      echo $includes
      ;;

    --libs-posix)
      echo -L@libdir@ -lpcreposix -lpcre
      ;;

    --libs)
      echo -L@libdir@ -lpcre
      ;;

    # Anything else is an error.
    *)
      echo "${usage}" 1>&2
      exit 1
      ;;
  esac
done
privoxy-3.0.21-stable/./pcre/install-sh 000640 001751 001751 00000012736 10546014100 016714 0 ustar 00fk fk 000000 000000 #!/bin/sh
#
# install - install a program, script, or datafile
# This comes from X11R5 (mit/util/scripts/install.sh).
#
# Copyright 1991 by the Massachusetts Institute of Technology
#
# Permission to use, copy, modify, distribute, and sell this software and its
# documentation for any purpose is hereby granted without fee, provided that
# the above copyright notice appear in all copies and that both that
# copyright notice and this permission notice appear in supporting
# documentation, and that the name of M.I.T. not be used in advertising or
# publicity pertaining to distribution of the software without specific,
# written prior permission. M.I.T. makes no representations about the
# suitability of this software for any purpose. It is provided "as is"
# without express or implied warranty.
#
# Calling this script install-sh is preferred over install.sh, to prevent
# `make' implicit rules from creating a file called install from it
# when there is no Makefile.
#
# This script is compatible with the BSD install script, but was written
# from scratch. It can only install one file at a time, a restriction
# shared with many OS's install programs.
# set DOITPROG to echo to test this script
# Don't use :- since 4.3BSD and earlier shells don't like it.
doit="${DOITPROG-}"

# put in absolute paths if you don't have them in your path; or use env. vars.
mvprog="${MVPROG-mv}"
cpprog="${CPPROG-cp}"
chmodprog="${CHMODPROG-chmod}"
chownprog="${CHOWNPROG-chown}"
chgrpprog="${CHGRPPROG-chgrp}"
stripprog="${STRIPPROG-strip}"
rmprog="${RMPROG-rm}"
mkdirprog="${MKDIRPROG-mkdir}"

# Renaming controls, filled in by the -b= and -t= options. The variable read
# by the rest of the script is "transformarg"; it has to be cleared here under
# exactly that name, otherwise a value inherited from the environment would be
# applied as a sed expression even though no -t= option was given.
transformbasename=""
transformarg=""

# Commands used for the installation. An empty command means "skip this step".
# The default is to move the file and make it mode 0755.
instcmd="$mvprog"
chmodcmd="$chmodprog 0755"
chowncmd=""
chgrpcmd=""
stripcmd=""
rmcmd="$rmprog -f"
mvcmd="$mvprog"

# Source, destination, and the flag set by -d (create a directory).
src=""
dst=""
dir_arg=""
# Parse the command line. Recognized options:
#   -c        copy the file instead of moving it
#   -d        treat the argument as a directory to be created
#   -m MODE   argument passed to $chmodprog
#   -o OWNER  argument passed to $chownprog
#   -g GROUP  argument passed to $chgrpprog
#   -s        run $stripprog on the installed file
#   -t=EXPR   sed expression applied to the destination base name
#   -b=SUFFIX suffix handed to basename and appended again after -t= is applied
# The first word that is not an option becomes the source; every later one
# overwrites the destination, so the last one wins.
# NOTE: the loop stops at the first empty argument.
while [ x"$1" != x ]; do
case $1 in
-c) instcmd="$cpprog"
shift
continue;;
-d) dir_arg=true
shift
continue;;
-m) chmodcmd="$chmodprog $2"
shift
shift
continue;;
-o) chowncmd="$chownprog $2"
shift
shift
continue;;
-g) chgrpcmd="$chgrpprog $2"
shift
shift
continue;;
-s) stripcmd="$stripprog"
shift
continue;;
-t=*) transformarg=`echo $1 | sed 's/-t=//'`
shift
continue;;
-b=*) transformbasename=`echo $1 | sed 's/-b=//'`
shift
continue;;
*) if [ x"$src" = x ]
then
src=$1
else
# this colon is to work around a 386BSD /bin/sh bug
:
dst=$1
fi
shift
continue;;
esac
done
# A source (or, with -d, a directory name) is mandatory.
if [ x"$src" = x ]
then
echo "install: no input file specified"
exit 1
else
true
fi
# Directory mode (-d): the single argument is the directory to create, so it
# is moved from src to dst. If it already exists the install command becomes
# the no-op ":" and the mode is left alone; otherwise it is created by mkdir.
if [ x"$dir_arg" != x ]; then
dst=$src
src=""
if [ -d $dst ]; then
instcmd=:
chmodcmd=""
else
instcmd=mkdir
fi
else
# File mode: check the source and the destination before doing anything.
# Waiting for this to be detected by the "$instcmd $src $dsttmp" command
# might cause directories to be created, which would be especially bad
# if $src (and thus $dsttmp) contains '*'.
if [ -f $src -o -d $src ]
then
true
else
echo "install: $src does not exist"
exit 1
fi
if [ x"$dst" = x ]
then
echo "install: no destination specified"
exit 1
else
true
fi
# If destination is a directory, append the input filename; if your system
# does not like double slashes in filenames, you may need to add some logic
if [ -d $dst ]
then
dst="$dst"/`basename $src`
else
true
fi
fi
## this sed command emulates the dirname command
# (strip the last path component, then a trailing slash; an empty result
# becomes ".")
dstdir=`echo $dst | sed -e 's,[^/]*$,,;s,/$,,;s,^$,.,'`
# Make sure that the destination directory exists.
# this part is taken from Noah Friedman's mkinstalldirs script
# Skip lots of stat calls in the usual case.
if [ ! -d "$dstdir" ]; then
# Remember the current IFS (or the default below if IFS is unset) so that it
# can be put back after the path has been split.
defaultIFS='
'
IFS="${IFS-${defaultIFS}}"
oIFS="${IFS}"
# Some sh's can't handle IFS=/ for some reason.
# So every "/" is rewritten as "%" and the path is split on "%" instead; a
# leading "%" is turned back into "/" so that an absolute path stays absolute.
# The components become the positional parameters.
IFS='%'
set - `echo ${dstdir} | sed -e 's@/@%@g' -e 's@^%@/@'`
IFS="${oIFS}"
# Rebuild the path one component at a time, creating each level that does
# not exist yet.
pathcomp=''
while [ $# -ne 0 ] ; do
pathcomp="${pathcomp}${1}"
shift
if [ ! -d "${pathcomp}" ] ;
then
$mkdirprog "${pathcomp}"
else
true
fi
pathcomp="${pathcomp}/"
done
fi
# Perform the installation. Both branches are "&&" chains: the first failing
# step stops the chain, and "exit 0" at the very end is reached only if the
# whole chain succeeded.
if [ x"$dir_arg" != x ]
then
# Directory mode: create the directory (or do nothing if it already exists),
# then apply whichever of owner, group, strip and mode were requested.
$doit $instcmd $dst &&
if [ x"$chowncmd" != x ]; then $doit $chowncmd $dst; else true ; fi &&
if [ x"$chgrpcmd" != x ]; then $doit $chgrpcmd $dst; else true ; fi &&
if [ x"$stripcmd" != x ]; then $doit $stripcmd $dst; else true ; fi &&
if [ x"$chmodcmd" != x ]; then $doit $chmodcmd $dst; else true ; fi
else
# If we're going to rename the final executable, determine the name now.
# Without -t= the name is simply the base name of the destination; with it,
# the -b= suffix is removed, the sed expression applied, and the suffix put
# back.
if [ x"$transformarg" = x ]
then
dstfile=`basename $dst`
else
dstfile=`basename $dst $transformbasename |
sed $transformarg`$transformbasename
fi
# don't allow the sed command to completely eliminate the filename
if [ x"$dstfile" = x ]
then
dstfile=`basename $dst`
else
true
fi
# Make a temp file name in the proper directory.
# ($$ is the process id of this shell; the "#" characters are part of the
# file name, not comment markers.)
dsttmp=$dstdir/#inst.$$#
# Move or copy the file name to the temp name
# The trap removes the temp file when the script exits.
$doit $instcmd $src $dsttmp &&
trap "rm -f ${dsttmp}" 0 &&
# and set any options; do chmod last to preserve setuid bits
# If any of these fail, we abort the whole thing. If we want to
# ignore errors from any of these, just make sure not to ignore
# errors from the above "$doit $instcmd $src $dsttmp" command.
if [ x"$chowncmd" != x ]; then $doit $chowncmd $dsttmp; else true;fi &&
if [ x"$chgrpcmd" != x ]; then $doit $chgrpcmd $dsttmp; else true;fi &&
if [ x"$stripcmd" != x ]; then $doit $stripcmd $dsttmp; else true;fi &&
if [ x"$chmodcmd" != x ]; then $doit $chmodcmd $dsttmp; else true;fi &&
# Now rename the file to the real destination.
$doit $rmcmd -f $dstdir/$dstfile &&
$doit $mvcmd $dsttmp $dstdir/$dstfile
fi &&
exit 0
privoxy-3.0.21-stable/./pcre/dftables 000750 001751 001751 00000054605 10546014100 016425 0 ustar 00fk fk 000000 000000 ELF ì‰4 G 4 ( ! 4 4€4€ Ô Ô€Ô€ € €ö7 ö7 8 È È # ¸8 ¸È¸ÈÐ Ð /libexec/ld-elf.so.1 % , ) &