123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135 |
- /* This file is part of the software similarity tester SIM.
- Written by Dick Grune, Vrije Universiteit, Amsterdam.
- $Id: algollike.c,v 2.4 2005/02/20 17:02:59 dick Exp $
- */
- /* This module implements the routines InitLanguage, MayBeStartOfRun
- and CheckRun for ALGOL-like languages, in which it is meaningful
- and useful to isolate function bodies.
- It requires the user to define, preferably in Xlang.l, four token
- sets, represented as TOKEN[] and terminated by NOTOKEN:
- TOKEN NonFinals[] tokens that may not end a chunk
- TOKEN NonInitials[] tokens that may not start a chunk
- TOKEN Openers[] openers of parentheses that must balance
- in functions
- TOKEN Closers[] the corresponding closers, in the same order
- */
- #include "options.h"
- #include "token.h"
- #include "algollike.h"
- /* Arrays for fast identification tests for tokens. Each token is
- identified by its position in the set + 1. For example, if T is
- the n-th Opener, openers[TOKEN2int(tk)] == n+1.
- */
- static char non_finals[256];
- static char non_initials[256];
- static char openers[256];
- static char closers[256];
- static void cvt2bittable(const TOKEN *tl, char bt[256]);
- static unsigned int largest_function(const TOKEN *str, unsigned int size);
- void
- InitLanguage(void) {
- /* convert the token sets to bitmaps */
- cvt2bittable(NonFinals, non_finals);
- cvt2bittable(NonInitials, non_initials);
- cvt2bittable(Openers, openers);
- cvt2bittable(Closers, closers);
- }
- static void
- cvt2bittable(const TOKEN *tl, char bt[256]) {
- int i;
- int cnt = 1;
- for (i = 0; !TOKEN_EQ(tl[i], NOTOKEN); i++) {
- bt[TOKEN2int(tl[i])] = cnt++;
- }
- }
- int
- MayBeStartOfRun(TOKEN tk) {
- return !non_initials[TOKEN2int(tk)];
- }
- unsigned int
- CheckRun(const TOKEN *str, unsigned int size) {
- /* Checks the run starting at str with length size for
- acceptability in the language. Cuts from the end if
- necessary and returns the accepted length, which may
- be zero.
- */
- if (option_set('f')) {
- /* reduce to a function-like form first */
- size = largest_function(str, size);
- }
- while ( /* there is trailing garbage */
- size != 0 && non_finals[TOKEN2int(str[size-1])]
- ) {
- /* remove it */
- size--;
- }
- return size;
- }
- static unsigned int
- largest_function(const TOKEN *str, unsigned int size) {
- /* Returns the size of the longest sequence starting at
- str[0] and not containing unbalanced parentheses.
- Does not check the nesting of the parentheses, but then,
- sim is syntax-free anyway.
- */
- register unsigned int mrb_size = 0; /* most recent balancing size */
- register unsigned int pos;
- register int i;
- int balance_count[256];
- int n_imbalances;
- /* clear administration */
- n_imbalances = 0;
- for (i = 0; i < 255; i++) {
- balance_count[i] = 0;
- }
- /* scan str[] and see how far we get */
- for (pos = 0; pos < size; pos++) {
- register int tkval = TOKEN2int(str[pos]);
- register int pp; /* parenthesis position */
- /* account for openers */
- if ((pp = openers[tkval])) {
- if (balance_count[pp] == 0) {
- /* about to create an imbalance */
- n_imbalances++;
- }
- balance_count[pp]++;
- }
- /* account for closers */
- if ((pp = closers[tkval])) {
- if (balance_count[pp] == 0) {
- /* this is one Closer too many */
- return mrb_size;
- }
- balance_count[pp]--;
- if (balance_count[pp] == 0) {
- /* we just cleared an imbalance */
- n_imbalances--;
- }
- }
- if (n_imbalances == 0) {
- /* register balance point */
- mrb_size = pos + 1;
- }
- }
- return mrb_size;
- }
|