/* "rgx.c" regular expression matching using C regex library.
* Copyright (C) 1995, 1996, 1997 Free Software Foundation, Inc.
*
* This program is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as
* published by the Free Software Foundation, either version 3 of the
* License, or (at your option) any later version.
*
* This program is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this program. If not, see
* <http://www.gnu.org/licenses/>.
*/
/* Author: Aubrey Jaffer */
#include "scm.h"
#ifdef __FreeBSD__
# include "gnuregex.h"
#else
# include "regex.h"
#endif
#include <ctype.h>
#include <stdio.h>
#include <string.h>
/* added by Denys Duchier: for bcopy */
#ifdef sun
#include <strings.h>
#endif
/* RCS "$Id$" keyword string recording the revision of this file.
   Nothing in the visible code reads it. */
static char rcsid[] =
"$Id: rgx.c,v 1.19 2008/01/31 03:32:33 jaffer Exp $";
/* Scratch-memory helpers for the matching routines.
 *
 * A function that uses ALLOCA must place "ALLOCA_PROTECT;" among its
 * local declarations.
 *
 * With HAVE_ALLOCA: ALLOCA is the system alloca() and ALLOCA_PROTECT is
 * a dummy typedef, so the declaration stays syntactically valid.
 *
 * Without it: ALLOCA_PROTECT declares a local list `alloca_protect'
 * (initially EOL).  Each ALLOCA(size) conses a new string made with
 * makstr(size) onto that list and yields CDR of the string cell cast to
 * void *.  NOTE(review): presumably CDR of a string cell is its character
 * buffer and the local list keeps the strings reachable for the
 * collector -- confirm against scm.h.
 */
#ifdef HAVE_ALLOCA
# include <alloca.h>
# define ALLOCA_PROTECT typedef int foobazzz
# define ALLOCA alloca
#else
# define ALLOCA_PROTECT SCM alloca_protect=EOL
# define ALLOCA(size) \
	(alloca_protect=cons(makstr((long)(size)), alloca_protect), \
	 (void *)CDR(CAR(alloca_protect)))
#endif
#ifdef _GNU_SOURCE
/* following two lines stolen from GNU regex.c */
/* CHAR_SET_SIZE: byte size of the fastmap and translate tables that
   lregcomp allocates for the GNU matcher. */
# define CHAR_SET_SIZE 256
/* ISUPPER: isupper() guarded by isascii(); lregcomp uses it to build
   the case-folding translate table. */
# define ISUPPER(c) (isascii (c) && isupper (c))
#endif
/* Forward declarations: lregexec and lregmatp call these (in the
   _GNU_SOURCE build) before their definitions further down. */
SCM lregsearch();
SCM lregsearchv();
/* Posix regexp bindings. */
static char s_regex[] = "regex";
static char s_regcomp[] = "regcomp", s_regerror[] = "regerror";
static char s_regexec[] = "regexec", s_regmatp[] = "regmatch?";
static char s_regsearch[] = "regsearch", s_regmatch[] = "regmatch";
static char s_regsearchv[] = "regsearchv", s_regmatchv[] = "regmatchv";
static char s_stringsplit[] = "string-split";
static char s_stringsplitv[] = "string-splitv";
static char s_stringedit[] = "string-edit";
#define s_error &s_regerror[3]
/* Accessors for a regex object `obj': CDR(obj) is treated as a pointer
   to its regex_info record. */
/* RGX_INFO: the record itself. */
#define RGX_INFO(obj) ((regex_info*)CDR(obj))
/* RGX_PATTERN: the pattern string stored in the record. */
#define RGX_PATTERN(obj) (((regex_info*)CDR(obj))->pattern)
/* RGX: address of the compiled regex_t. */
#define RGX(obj) (&((regex_info*)CDR(obj))->rgx)
#ifndef _GNU_SOURCE
/* RGX2: address of the second regex_t, used for anchored matching. */
# define RGX2(obj) (&((regex_info*)CDR(obj))->rgx_anchored)
#endif
/* FIXUP_REGEXP(prog): let a caller pass an uncompiled pattern where a
 * compiled regex object is expected.  `prog' must be an assignable SCM
 * variable; it is evaluated several times.
 *   - a string is replaced by lregcomp(prog, UNDEFINED);
 *   - a list whose first two elements are both strings is replaced by
 *     lregcomp(first, second);
 *   - anything else is left as is, for the caller's own ASRTER check.
 * Every STRINGP/CONSP test is preceded by NIMP on the same value, as
 * everywhere else in this file, so that an immediate value is never
 * inspected as a heap cell.  The do/while(0) wrapper makes the macro a
 * single statement; invoke it with a trailing semicolon.
 */
#define FIXUP_REGEXP(prog) \
do { \
  if (NIMP(prog) && STRINGP(prog)) \
    prog = lregcomp(prog, UNDEFINED); \
  if (NIMP(prog) && CONSP(prog) && \
      NIMP(CAR(prog)) && STRINGP(CAR(prog)) && \
      NIMP(CDR(prog)) && CONSP(CDR(prog)) && \
      NIMP(CAR(CDR(prog))) && STRINGP(CAR(CDR(prog)))) \
    prog = lregcomp(CAR(prog), CAR(CDR(prog))); \
} while (0)
/* Data attached to every compiled regex object (reached through
   RGX_INFO). */
typedef struct regex_info {
SCM pattern; /* string we compiled to create our reg exp */
regex_t rgx; /* the compiled expression */
#ifndef _GNU_SOURCE
/* Compile flags kept so the anchored variant can be built later;
   lregsearchmatch resets this to 0 once rgx_anchored is compiled, and
   fregex relies on that to decide whether to free rgx_anchored. */
int options; /* for anchored pattern when matching */
regex_t rgx_anchored; /* "^"-prefixed pattern, compiled on first match */
#endif
} regex_info;
sizet fregex(ptr)
CELLPTR ptr;
{
regfree(RGX((SCM)ptr));
#ifndef _GNU_SOURCE
/* options are null => we compiled the anchored pattern */
if (RGX_INFO((SCM)ptr)->options==0)
regfree(RGX2((SCM)ptr));
#endif
must_free(CHARS((SCM)ptr), (sizet)LENGTH((SCM)ptr));
return 0;
}
/* Print hook for regex objects: writes "#<regex ", the CDR of the
 * object in radix 16, a space, the source pattern and a closing '>' to
 * `port'.  `writing' is passed on to the pattern printer.  Returns 1.
 *
 * NOTE(review): this body was restored from a damaged line of which only
 * `lputs("#' and `', port);' survived.  The calls in between follow the
 * usual SCM smob-printer shape; confirm against the upstream sources.
 */
int prinregex(exp, port, writing)
     SCM exp; SCM port; int writing;
{
  lputs("#<regex ", port);
  scm_intprint(CDR(exp), -16, port);
  lputs(" ", port);
  scm_iprin1(RGX_PATTERN(exp), port, writing);
  lputc('>', port);
  return 1;
}
SCM markregex(ptr)
SCM ptr;
{
SETGC8MARK(RGX_PATTERN(ptr));
return BOOL_F;
}
/* Type code of regex objects; assigned from newsmob() in init_rgx. */
int tc16_rgx;
/* Hook table handed to newsmob(): markregex, fregex and prinregex. */
static smobfuns rgxsmob = {markregex, fregex, prinregex};
/* (regerror code): turn an error number returned by regcomp into a
 * message string.
 *
 * scode must be a fixnum.  A negative code yields "Invalid code".  When
 * the GNU regex header is in use (__REGEXP_LIBRARY_H__) the text comes
 * from regerror(); otherwise the fixed string "error" is returned.
 */
SCM lregerror(scode)
     SCM scode;
{
  int errnum;
  /* added by Denys Duchier: conditional declaration */
#ifdef __REGEXP_LIBRARY_H__
  int needed;
  SCM msg;
#endif

  ASRTER(INUMP(scode), scode, ARG1, s_regerror);
  errnum = INUM(scode);
  if (errnum < 0)
    return makfromstr("Invalid code", sizeof("Invalid code")-1);

  /* XXX - is regerror posix or not? */
#ifdef __REGEXP_LIBRARY_H__
  /* XXX - gnu regexp doesn't use the re parameter, so it is passed as
     null.  The first call only asks for the required buffer size
     (including the terminator); the second fills the string. */
  needed = regerror(errnum, 0L, 0L, 0);
  msg = makstr(needed-1);
  regerror(errnum, 0L, CHARS(msg), needed);
  return msg;
#else
  return makfromstr(s_error, (sizet)5);
#endif
}
SCM lregcomp(pattern, flags)
SCM pattern, flags;
{
SCM z;
int i, options;
regex_t *prog;
regex_info *info;
char *flagchars;
#ifdef _GNU_SOURCE
int fastmap = 0;
int ignore_case = 0;
char *err_msg;
#endif
ASRTER(NIMP(pattern) && STRINGP(pattern), pattern, ARG1, s_regcomp);
ASRTER(UNBNDP(flags) || (NIMP(flags) && STRINGP(flags)),
flags, ARG2, s_regcomp);
DEFER_INTS;
z = must_malloc_cell((long)sizeof(regex_info), (SCM)tc16_rgx, s_regex);
scm_protect_temp(&z);
info=(regex_info*)CHARS(z);
prog = &(info->rgx);
#ifdef __REGEXP_LIBRARY_H__
for (i=sizeof(regex_t);i--;((char *)prog)[i] = 0);
# ifndef _GNU_SOURCE
{
regex_t *prog2;
prog2 = &(info->rgx_anchored);
for (i=sizeof(regex_t);i--;((char *)prog2)[i] = 0);
}
# endif
#endif
ALLOW_INTS;
info->pattern = pattern;
#ifdef _GNU_SOURCE
options = RE_SYNTAX_POSIX_EXTENDED;
#else
options = REG_EXTENDED;
#endif
if (!UNBNDP(flags)) {
flagchars = CHARS(flags);
for (i=0; ifastmap = must_malloc(CHAR_SET_SIZE, s_regex);
else
prog->fastmap = 0;
if (ignore_case) {
prog->translate = must_malloc(CHAR_SET_SIZE, s_regex);
for (i = 0; i < CHAR_SET_SIZE; i++)
prog->translate[i] = ISUPPER (i) ? tolower (i) : i;
}
else
prog->translate = 0;
prog->buffer = 0;
prog->allocated = 0;
re_set_syntax(options);
err_msg = (char *)re_compile_pattern(CHARS(pattern), LENGTH(pattern), prog);
ALLOW_INTS;
prog->regs_allocated = REGS_FIXED;
/* if error, compile using regcomp to get the error number */
if (err_msg) {
int i;
char *tmppat;
SCM protect;
/* Fixup in case pattern has null characters */
tmppat = CHARS(protect=makstr(LENGTH(pattern)));
bcopy(CHARS(pattern), tmppat, LENGTH(pattern));
for (i=0; ioptions = options;
i = regcomp(prog, CHARS(pattern), options);
if (i) z = MAKINUM(i);
#endif
return z;
}
/* (regexec re string): search `str' for `prog'.
 *
 * prog may be a compiled regex object or anything FIXUP_REGEXP accepts.
 * Returns BOOL_F when regexec reports anything other than success;
 * otherwise a vector of 2*(re_nsub+1) fixnums holding the start and end
 * offset of the whole match followed by those of each subexpression.
 * Under _GNU_SOURCE the work is delegated to lregsearchv.
 */
SCM lregexec(prog, str)
     SCM prog, str;
{
  ALLOCA_PROTECT;

  FIXUP_REGEXP(prog);
  ASRTER(NIMP(prog) && tc16_rgx==CAR(prog), prog, ARG1, s_regexec);
  ASRTER(NIMP(str) && STRINGP(str), str, ARG2, s_regexec);

#ifdef _GNU_SOURCE
  return lregsearchv(prog, str, EOL);
#else /* not _GNU_SOURCE */
  {
    size_t count = RGX(prog)->re_nsub + 1; /* XXX - is this posix? */
    size_t k;
    regmatch_t *spans;
    SCM result;
    int eflags = 0;		/* XXX - optional arg? */

    spans = ALLOCA(count * sizeof(regmatch_t));
    if (regexec(RGX(prog), CHARS(str), count, spans, eflags) != 0)
      return BOOL_F;

    result = make_vector(MAKINUM(2L * count), MAKINUM(-1L));
    for (k = 0; k < count; k++) {
      VELTS(result)[2*k+0] = MAKINUM(spans[k].rm_so);
      VELTS(result)[2*k+1] = MAKINUM(spans[k].rm_eo);
    }
    return result;
  }
#endif /* _GNU_SOURCE */
}
/* (regmatch? re string): BOOL_T if `prog' matches somewhere in `str',
 * BOOL_F otherwise.
 *
 * prog may be a compiled regex object or anything FIXUP_REGEXP accepts.
 * In the POSIX build a regexec status other than 0 or REG_NOMATCH is
 * reported through wta.
 */
SCM lregmatp(prog, str)
     SCM prog, str;
{
  FIXUP_REGEXP(prog);
  ASRTER(NIMP(prog) && tc16_rgx==CAR(prog), prog, ARG1, s_regmatp);
  ASRTER(NIMP(str) && STRINGP(str), str, ARG2, s_regmatp);

#ifdef _GNU_SOURCE
  if (BOOL_F == lregsearch(prog, str, EOL))
    return BOOL_F;
  return BOOL_T;
#else /* not _GNU_SOURCE */
  {
    /* no match offsets are requested; eflags is 0 (XXX - optional arg?) */
    int status = regexec(RGX(prog), CHARS(str), 0, 0, 0);

    if (0 == status)
      return BOOL_T;
    if (status != REG_NOMATCH)
      wta(MAKINUM(status), s_error, s_regmatp);
    return BOOL_F;
  }
#endif
}
/* Result shape: values for the `vector' argument of lregsearchmatch
   and stringsplitutil. */
#define SCALAR 0
#define VECTOR 1
/* Operation: values for the `search' argument of lregsearchmatch. */
#define MATCH 0
#define SEARCH 1
/* Common engine behind regsearch, regsearchv, regmatch and regmatchv.
 *
 * prog    compiled regex object, or anything FIXUP_REGEXP accepts.
 * str     the subject string.
 * args    list of up to two fixnums: start offset (default 0) and end
 *         offset (default LENGTH(str)).
 * search  SEARCH: find the pattern anywhere from `start' on;
 *         MATCH:  the pattern must match beginning exactly at `start'.
 * vector  VECTOR: return a vector of 2*(re_nsub+1) fixnums holding
 *         start/end offsets of the whole match and each subexpression;
 *         SCALAR: return one fixnum -- the match position for SEARCH,
 *         the match length for MATCH.
 *
 * Returns BOOL_F when nothing matches.  Argument errors are reported
 * under the name "regsearch" for all four callers.
 *
 * `size' is clamped to LENGTH(str) in both builds, so the GNU matcher is
 * never told the subject is longer than it is.
 */
SCM lregsearchmatch(prog, str, args, search, vector)
     SCM prog, str, args;
     int vector, search;
{
  int len = ilength(args);
  int start, size, nsub;
  SCM matches;
  ALLOCA_PROTECT;

  FIXUP_REGEXP(prog);
  ASRTER(NIMP(prog) && tc16_rgx==CAR(prog), prog, ARG1, s_regsearch);
  ASRTER(NIMP(str) && STRINGP(str), str, ARG2, s_regsearch);
  ASRTER(len<=2, args, WNA, s_regsearch);
  ASRTER((len<1)||(INUMP(CAR(args))), CAR(args), ARG3, s_regsearch);
  ASRTER((len<2)||(INUMP(CAR(CDR(args)))), CAR(CDR(args)), ARG4, s_regsearch);

  start = (len>=1)?(INUM(CAR(args))):0;
  size = (len>=2)?(INUM(CAR(CDR(args)))):LENGTH(str);
  if (size > LENGTH(str))
    size = LENGTH(str);

#ifdef _GNU_SOURCE
  {
    int ret, dir=1;
    struct re_registers regs, *pregs=0;

    /* a negative start asks for a backward search from -start */
    if (search && start<0)
      start *= -1, dir = -1;

    if (vector) {
      /* registers are only requested when offsets will be returned */
      pregs = &regs;
      nsub = RGX(prog)->re_nsub + 1;
      regs.num_regs = nsub;

      regs.start = ALLOCA(nsub * sizeof(regoff_t));
      regs.end   = ALLOCA(nsub * sizeof(regoff_t));
    }

    if (search)
      ret = re_search(RGX(prog), CHARS(str), size, start, dir*size, pregs);
    else
      ret = re_match(RGX(prog), CHARS(str), size, start, pregs);

    if (ret < 0)
      return BOOL_F;

    if (!vector)
      return MAKINUM(ret);

    matches = make_vector(MAKINUM(2L * nsub), MAKINUM(-1L));
    while (nsub--) {
      VELTS(matches)[2*nsub+0] = MAKINUM(regs.start[nsub]);
      VELTS(matches)[2*nsub+1] = MAKINUM(regs.end[nsub]);
    }
    return matches;
  }
#else /* not _GNU_SOURCE */
  {
    regex_t *regexp;
    regmatch_t *pm;
    char *search_string;

    if (start<0 || start >= size)
      return BOOL_F;

    if (size < LENGTH(str)) {
      /* regexec needs a terminated string: copy the requested window */
      search_string = ALLOCA(size-start+1);
      bcopy(CHARS(str)+start, search_string, size-start);
      search_string[size-start] = 0;
    } else
      search_string = CHARS(str)+start;

    nsub = RGX(prog)->re_nsub + 1;
    pm = ALLOCA(nsub * sizeof(regmatch_t));
    if (search)
      regexp = RGX(prog);
    else {
      /* doing a match */
      if (RGX_INFO(prog)->options) {
	/* First match with this object: compile "^" + pattern into
	   rgx_anchored.  strlen & strcpy OK, posix patterns are null
	   terminated.
	   NOTE(review): "^a|b" anchors only the first alternative --
	   confirm whether that is intended. */
	char *pattern;
	int code;

	pattern = ALLOCA(strlen(CHARS(RGX_PATTERN(prog)))+2);
	pattern[0] = '^';
	strcpy(pattern+1, CHARS(RGX_PATTERN(prog)));
	code = regcomp(RGX2(prog), pattern, RGX_INFO(prog)->options);
	if (code) {
	  /* leave options non-zero so rgx_anchored is neither used nor
	     freed while it is not compiled */
	  wta(MAKINUM(code), s_error, s_regmatch);
	  return BOOL_F;
	}
	RGX_INFO(prog)->options = 0;
      }
      regexp = RGX2(prog);
    }

    if (regexec(regexp, search_string, nsub, pm, 0) != 0)
      return BOOL_F;

    /* offsets are relative to search_string; add start back */
    if (vector) {
      matches = make_vector(MAKINUM(2L * nsub), MAKINUM(-1L));
      while (nsub--) {
	VELTS(matches)[2*nsub+0] = MAKINUM(pm[nsub].rm_so + start);
	VELTS(matches)[2*nsub+1] = MAKINUM(pm[nsub].rm_eo + start);
      }
      return matches;
    }

    if (search)
      return MAKINUM(pm[0].rm_so + start);
    else
      return MAKINUM(pm[0].rm_eo - pm[0].rm_so);
  }
#endif /* _GNU_SOURCE */
}
/* (regsearch re string [start [end]]): lregsearchmatch in SEARCH mode
   with a SCALAR result. */
SCM lregsearch(prog, str, args)
SCM prog, str, args;
{
return lregsearchmatch(prog, str, args, SEARCH, SCALAR);
}
/* (regsearchv re string [start [end]]): lregsearchmatch in SEARCH mode
   with a VECTOR result. */
SCM lregsearchv(prog, str, args)
SCM prog, str, args;
{
return lregsearchmatch(prog, str, args, SEARCH, VECTOR);
}
/* (regmatch re string [start [end]]): lregsearchmatch in MATCH mode
   with a SCALAR result. */
SCM lregmatch(prog, str, args)
SCM prog, str, args;
{
return lregsearchmatch(prog, str, args, MATCH, SCALAR);
}
/* (regmatchv re string [start [end]]): lregsearchmatch in MATCH mode
   with a VECTOR result. */
SCM lregmatchv(prog, str, args)
SCM prog, str, args;
{
return lregsearchmatch(prog, str, args, MATCH, VECTOR);
}
/* Shared worker for string-split and string-splitv.
 *
 * Repeatedly searches `str' for `prog' and cuts the string at every
 * non-empty match.  The text before each such match, plus any text left
 * after the last one, forms the pieces.
 *
 * vector  SCALAR: return a vector of the piece substrings;
 *         VECTOR: return a vector of fixnum pairs (start, end), two
 *         elements per piece.
 *
 * prog may be a compiled regex object or anything FIXUP_REGEXP accepts.
 * Argument errors are reported under the name "string-split".
 */
SCM stringsplitutil(prog, str, vector)
     SCM prog, str;
     int vector;
{
  int piece_start = 0;		/* where the piece being built begins */
  int from = 0;			/* where the next search starts */
  int pieces = 0;
  int slot;
  SCM spans = EOL;		/* (start end start end ...), last piece first */
  SCM hit, ret;

  FIXUP_REGEXP(prog);
  ASRTER(NIMP(prog) && tc16_rgx==CAR(prog), prog, ARG1, s_stringsplit);
  ASRTER(NIMP(str) && STRINGP(str), str, ARG2, s_stringsplit);

  for (hit = lregsearchv(prog, str, cons(MAKINUM(from), EOL));
       hit != BOOL_F;
       hit = lregsearchv(prog, str, cons(MAKINUM(from), EOL))) {
    int lo = INUM(VELTS(hit)[0]);
    int hi = INUM(VELTS(hit)[1]);

    /* only a non-empty match ends a piece */
    if (lo < hi) {
      spans = cons2(MAKINUM(piece_start), MAKINUM(lo), spans);
      piece_start = hi;
      pieces++;
    }

    /* always make progress, even past an empty match */
    if (hi > from)
      from = hi;
    else
      from = from + 1;
  }

  /* get that tail bit */
  if (piece_start < LENGTH(str)) {
    spans = cons2(MAKINUM(piece_start), MAKINUM(LENGTH(str)), spans);
    pieces++;
  }

  ret = make_vector(MAKINUM(vector ? (2*pieces) : pieces), EOL);

  /* spans lists the last piece first, so fill the vector back to front */
  for (slot = pieces - 1; slot >= 0; slot--) {
    SCM lo = CAR(spans);
    SCM hi = CAR(CDR(spans));

    if (vector) {
      VELTS(ret)[slot*2+0] = lo;
      VELTS(ret)[slot*2+1] = hi;
    } else
      VELTS(ret)[slot] = substring(str, lo, hi);

    spans = CDR(CDR(spans));
  }
  return ret;
}
/* (string-split re string): stringsplitutil returning the pieces as
   substrings (SCALAR). */
SCM lstringsplit(prog, str)
SCM prog, str;
{
return stringsplitutil(prog, str, SCALAR);
}
/* (string-splitv re string): stringsplitutil returning start/end offset
   pairs (VECTOR). */
SCM lstringsplitv(prog, str)
SCM prog, str;
{
return stringsplitutil(prog, str, VECTOR);
}
/* Node of the singly linked lists built by lstringedit.  It names the
   characters [start, end) of `string'.  In the parsed edit-spec list an
   `end' of -1 instead marks a subexpression reference whose number is
   kept in `start'. */
typedef struct _item {
struct _item *next;
char *string;
int start;
int end;
} *editItem;
/* PUSH(list, string, start, end): allocate a new item with ALLOCA, fill
   it in and make it the new head of `list' (an editItem lvalue).  The
   enclosing function must have ALLOCA_PROTECT among its declarations.
   The expansion is a brace block that declares a local named `item', so
   the arguments must not refer to a variable of that name. */
#define PUSH(list, string_parm, start_parm, end_parm) \
{ \
editItem item; \
\
item = ALLOCA(sizeof(*item)); \
item->next = list; \
list = item; \
item->string = string_parm; \
item->start = start_parm; \
item->end = end_parm; \
}
/* (string-edit []) */
SCM lstringedit(prog, editspec, args)
SCM prog, editspec, args;
{
int match_start, match_end, search_base, editcount;
int total_len;
int i, args_len, anchor, maxsubnum;
int backslash;
char *ptr;
editItem editlist, substrings, edit;
SCM str, count, next_edit;
SCM result;
ALLOCA_PROTECT;
args_len = ilength(args);
FIXUP_REGEXP(prog);
ASRTER(NIMP(prog) && tc16_rgx==CAR(prog), prog, ARG1, s_stringedit);
ASRTER(NIMP(editspec) && STRINGP(editspec), editspec, ARG2, s_stringedit);
ASRTER((args_len==1)||(args_len==2), args, WNA, s_stringedit);
str = CAR(args);
ASRTER(NIMP(str)&&STRINGP(str), str, ARG3, s_stringedit);
if (args_len==2) {
count = CAR(CDR(args));
ASRTER(INUMP(count)||(count==BOOL_T), count, ARG4, s_stringedit);
} else
count = MAKINUM(1);
/* process the editspec - break it into a list of dotted pairs
* of integers for substrings to be inserted and
* integers representing matched subexpressions that
* should be inserted.
*/
maxsubnum = RGX(prog)->re_nsub;
anchor = 0;
backslash = 0;
editlist = 0;
ptr = CHARS(editspec);
for (i=0; i='0') && (ptr[i] <='9') &&
((ptr[i]-'0')<=maxsubnum))
{
if ((i-1)>anchor)
PUSH(editlist, CHARS(editspec), anchor, i-1);
PUSH(editlist, CHARS(editspec), ptr[i]-'0', -1);
anchor = i+1;
}
backslash = (ptr[i] == '\\')?1:0;
}
if (anchor < LENGTH(editspec))
PUSH(editlist, CHARS(editspec), anchor, LENGTH(editspec));
/* now, reverse the list of edit items */
{
editItem prev, cur, next;
for (prev=0, cur=editlist; cur; prev=cur, cur=next) {
next = cur->next;
cur->next = prev;
}
editlist = prev;
}
anchor = 0;
search_base = 0;
editcount = 0;
substrings = 0;
next_edit = lregsearchv(prog, str, cons(MAKINUM(search_base), EOL));
while (next_edit != BOOL_F) {
if (INUMP(count) && (editcount==INUM(count)))
break;
match_start = INUM(VELTS(next_edit)[0]);
match_end = INUM(VELTS(next_edit)[1]);
if (match_start < match_end) {
PUSH(substrings, CHARS(str), anchor, match_start);
anchor = match_end;
}
for (edit=editlist; edit; edit=edit->next) {
if (edit->end == -1) {
/* A backslash number in the original editspec */
PUSH(substrings, CHARS(str),
INUM(VELTS(next_edit)[edit->start*2+0]),
INUM(VELTS(next_edit)[edit->start*2+1]));
} else
/* normal string in the editspec */
PUSH(substrings, edit->string, edit->start, edit->end);
}
editcount++;
search_base = ((match_end>search_base)?match_end:(search_base+1));
next_edit = lregsearchv(prog, str, cons(MAKINUM(search_base), EOL));
}
/* get that tail bit */
if (anchor < LENGTH(str))
PUSH(substrings, CHARS(str), anchor, LENGTH(str));
/* assemble the result string */
for (edit=substrings, total_len=0; edit; edit=edit->next)
total_len += (edit->end - edit->start);
result = makstr(total_len);
ptr = CHARS(result) + total_len; /* point at the null at the end */
for (edit=substrings; edit; edit=edit->next) {
ptr -= (edit->end - edit->start);
bcopy(edit->string + edit->start, ptr, edit->end - edit->start);
}
return result;
}
#undef PUSH
/* Module initialisation: obtain the regex type code from newsmob(),
   register every Scheme-visible procedure of this file with make_subr,
   and add the "regex" feature. */
void init_rgx()
{
tc16_rgx = newsmob(&rgxsmob);
/* NOTE(review): the tc7_* codes presumably describe each procedure's
   argument convention (subr_2o taking an optional second argument,
   lsubr_2 taking two arguments plus a rest list) -- confirm against
   scm.h. */
make_subr(s_regcomp, tc7_subr_2o, lregcomp);
make_subr(s_regexec, tc7_subr_2, lregexec);
make_subr(s_regmatp, tc7_subr_2, lregmatp);
make_subr(s_regerror, tc7_subr_1, lregerror);
make_subr(s_regsearch, tc7_lsubr_2, lregsearch);
make_subr(s_regsearchv, tc7_lsubr_2, lregsearchv);
make_subr(s_regmatch, tc7_lsubr_2, lregmatch);
make_subr(s_regmatchv, tc7_lsubr_2, lregmatchv);
make_subr(s_stringsplit, tc7_subr_2, lstringsplit);
make_subr(s_stringsplitv, tc7_subr_2, lstringsplitv);
make_subr(s_stringedit, tc7_lsubr_2, lstringedit);
add_feature(s_regex);
}