fpc/utils/sim_pasc/algollike.c

136 lines
3.4 KiB
C

/* This file is part of the software similarity tester SIM.
Written by Dick Grune, Vrije Universiteit, Amsterdam.
$Id: algollike.c,v 2.4 2005/02/20 17:02:59 dick Exp $
*/
/* This module implements the routines InitLanguage, MayBeStartOfRun
and CheckRun for ALGOL-like languages, in which it is meaningful
and useful to isolate function bodies.
It requires the user to define, preferably in Xlang.l, four token
sets, represented as TOKEN[] and terminated by NOTOKEN:
TOKEN NonFinals[] tokens that may not end a chunk
TOKEN NonInitials[] tokens that may not start a chunk
TOKEN Openers[] openers of parentheses that must balance
in functions
TOKEN Closers[] the corresponding closers, in the same order
*/
#include "options.h"
#include "token.h"
#include "algollike.h"
/* Arrays for fast identification tests for tokens. Each token is
identified by its position in the set + 1. For example, if T is
the n-th Opener, openers[TOKEN2int(tk)] == n+1.
*/
static char non_finals[256];
static char non_initials[256];
static char openers[256];
static char closers[256];
static void cvt2bittable(const TOKEN *tl, char bt[256]);
static unsigned int largest_function(const TOKEN *str, unsigned int size);
void
InitLanguage(void) {
/* convert the token sets to bitmaps */
cvt2bittable(NonFinals, non_finals);
cvt2bittable(NonInitials, non_initials);
cvt2bittable(Openers, openers);
cvt2bittable(Closers, closers);
}
static void
cvt2bittable(const TOKEN *tl, char bt[256]) {
int i;
int cnt = 1;
for (i = 0; !TOKEN_EQ(tl[i], NOTOKEN); i++) {
bt[TOKEN2int(tl[i])] = cnt++;
}
}
int
MayBeStartOfRun(TOKEN tk) {
return !non_initials[TOKEN2int(tk)];
}
unsigned int
CheckRun(const TOKEN *str, unsigned int size) {
/* Checks the run starting at str with length size for
acceptability in the language. Cuts from the end if
necessary and returns the accepted length, which may
be zero.
*/
if (option_set('f')) {
/* reduce to a function-like form first */
size = largest_function(str, size);
}
while ( /* there is trailing garbage */
size != 0 && non_finals[TOKEN2int(str[size-1])]
) {
/* remove it */
size--;
}
return size;
}
static unsigned int
largest_function(const TOKEN *str, unsigned int size) {
/* Returns the size of the longest sequence starting at
str[0] and not containing unbalanced parentheses.
Does not check the nesting of the parentheses, but then,
sim is syntax-free anyway.
*/
register unsigned int mrb_size = 0; /* most recent balancing size */
register unsigned int pos;
register int i;
int balance_count[256];
int n_imbalances;
/* clear administration */
n_imbalances = 0;
for (i = 0; i < 255; i++) {
balance_count[i] = 0;
}
/* scan str[] and see how far we get */
for (pos = 0; pos < size; pos++) {
register int tkval = TOKEN2int(str[pos]);
register int pp; /* parenthesis position */
/* account for openers */
if ((pp = openers[tkval])) {
if (balance_count[pp] == 0) {
/* about to create an imbalance */
n_imbalances++;
}
balance_count[pp]++;
}
/* account for closers */
if ((pp = closers[tkval])) {
if (balance_count[pp] == 0) {
/* this is one Closer too many */
return mrb_size;
}
balance_count[pp]--;
if (balance_count[pp] == 0) {
/* we just cleared an imbalance */
n_imbalances--;
}
}
if (n_imbalances == 0) {
/* register balance point */
mrb_size = pos + 1;
}
}
return mrb_size;
}