12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
71778177917801781178217831784178517861787178817891790179117921793179417951796179717981799180018011802180318041805180618071808180918101811181218131814181518161817181818191820182118221823182418251826182718281829183018311832183318341835183618371838183918401841184218431844184518461847184818491850185118521853185418551856185718581859186018611862186318641865186618671868186918701871187218731874187518761877187818791880188118821883188418851886188718881889189018911892189318941895189618971898189919001901190219031904190519061907190819091910191119121913191419151916191719181919192019211922192319241925192619271928192919301931193219331934193519361937193819391940194119421943194419451946194719481949195019511952195319541955195619571958195919601961196219631964196519661967196819691970197119721973197419751976197719781979198019811982198319841985198619871988198919901991199219931994199519961997199819992000200120022003200420052006200720082009201020112012201320142015201620172018201920202021202220232024202520262027202820292030203120322033203420352036203720382039204020412042204320442045204620472048204920502051205220532054205520562057205820592060206120622063206420652066206720682069207020712072207320742075207620772078207920802081208220832084208520862087208820892090209120922093209420952096209720982099210021012102210321042105210621072108210921102111211221132114211521162117211821192120212121222123212421252126212721282129213021312132213321342135213621372138213921402141214221432144214521462147214821492150215121522153215421552156215721582159216021612162216321642165216621672168216921702171217221732174217521762177217821792180218121822183218421852186218721882189219021912192219321942195219621972198219922002201220222032204220522062207220822092210221122122213221422152216221722182219222022212222222322242225222622272228222922302231223222332234223522362237223822392240224122422243224422452246224722482249225022512252225322542255225622572258225922602261226222632264226522662267226822692270227122722273227422752276227
72278227922802281228222832284228522862287228822892290229122922293229422952296229722982299230023012302230323042305230623072308230923102311231223132314231523162317231823192320232123222323232423252326232723282329233023312332233323342335233623372338233923402341234223432344234523462347234823492350235123522353235423552356235723582359236023612362236323642365236623672368236923702371237223732374237523762377237823792380238123822383238423852386238723882389239023912392239323942395239623972398239924002401240224032404240524062407240824092410241124122413241424152416241724182419242024212422242324242425242624272428242924302431243224332434243524362437243824392440244124422443244424452446244724482449245024512452245324542455245624572458245924602461246224632464246524662467246824692470247124722473247424752476247724782479248024812482248324842485248624872488248924902491249224932494249524962497249824992500250125022503250425052506250725082509251025112512251325142515251625172518251925202521252225232524252525262527252825292530253125322533253425352536253725382539254025412542254325442545254625472548254925502551255225532554255525562557255825592560256125622563256425652566256725682569257025712572257325742575257625772578257925802581258225832584258525862587258825892590259125922593259425952596259725982599260026012602260326042605260626072608260926102611261226132614261526162617261826192620262126222623262426252626262726282629263026312632263326342635263626372638263926402641264226432644264526462647264826492650265126522653265426552656265726582659266026612662266326642665266626672668266926702671267226732674267526762677267826792680268126822683268426852686268726882689269026912692269326942695269626972698269927002701270227032704270527062707270827092710271127122713271427152716271727182719272027212722272327242725272627272728272927302731273227332734273527362737273827392740274127422743274427452746274727482749275027512752275327542755275627572758275927602761276227632764276527662767276827692770277127722773277427752776277
72778277927802781278227832784278527862787278827892790279127922793279427952796279727982799280028012802280328042805280628072808280928102811281228132814281528162817281828192820282128222823282428252826282728282829283028312832283328342835283628372838283928402841284228432844284528462847284828492850285128522853285428552856285728582859286028612862286328642865286628672868286928702871287228732874287528762877287828792880288128822883288428852886288728882889289028912892289328942895289628972898289929002901290229032904290529062907290829092910291129122913291429152916291729182919292029212922292329242925292629272928292929302931293229332934293529362937293829392940294129422943294429452946294729482949295029512952295329542955295629572958295929602961296229632964296529662967296829692970297129722973297429752976297729782979298029812982298329842985298629872988298929902991299229932994299529962997299829993000300130023003300430053006300730083009301030113012301330143015301630173018301930203021302230233024302530263027302830293030303130323033303430353036303730383039304030413042304330443045304630473048304930503051305230533054305530563057305830593060306130623063306430653066306730683069307030713072307330743075307630773078307930803081308230833084308530863087308830893090309130923093309430953096309730983099310031013102310331043105310631073108310931103111311231133114311531163117311831193120312131223123312431253126312731283129313031313132313331343135313631373138313931403141314231433144314531463147314831493150315131523153315431553156315731583159316031613162316331643165316631673168316931703171317231733174317531763177317831793180318131823183318431853186318731883189319031913192319331943195319631973198319932003201320232033204320532063207320832093210321132123213321432153216321732183219322032213222322332243225322632273228322932303231323232333234323532363237323832393240324132423243324432453246324732483249325032513252325332543255325632573258325932603261326232633264326532663267326832693270327132723273327432753276327
73278327932803281328232833284328532863287328832893290329132923293329432953296329732983299330033013302330333043305330633073308330933103311331233133314331533163317331833193320332133223323332433253326332733283329333033313332333333343335333633373338333933403341334233433344334533463347334833493350335133523353335433553356335733583359336033613362336333643365336633673368336933703371337233733374337533763377337833793380338133823383338433853386338733883389339033913392339333943395339633973398339934003401340234033404340534063407340834093410341134123413341434153416341734183419342034213422342334243425342634273428342934303431343234333434343534363437343834393440344134423443344434453446344734483449345034513452345334543455345634573458345934603461346234633464346534663467346834693470347134723473347434753476347734783479348034813482348334843485348634873488348934903491349234933494349534963497349834993500350135023503350435053506350735083509351035113512351335143515351635173518351935203521352235233524352535263527352835293530353135323533353435353536353735383539354035413542354335443545354635473548354935503551355235533554355535563557355835593560356135623563356435653566356735683569357035713572357335743575357635773578357935803581358235833584358535863587358835893590359135923593359435953596359735983599360036013602360336043605360636073608360936103611361236133614361536163617361836193620362136223623362436253626362736283629363036313632363336343635363636373638363936403641364236433644364536463647364836493650365136523653365436553656365736583659366036613662366336643665366636673668366936703671367236733674367536763677367836793680368136823683368436853686368736883689369036913692369336943695369636973698369937003701370237033704370537063707370837093710371137123713371437153716371737183719372037213722372337243725372637273728372937303731373237333734373537363737373837393740374137423743374437453746374737483749375037513752375337543755375637573758375937603761376237633764376537663767376837693770377137723773377437753776377
737783779378037813782378337843785378637873788378937903791379237933794379537963797379837993800380138023803380438053806380738083809381038113812381338143815381638173818381938203821382238233824382538263827382838293830383138323833383438353836383738383839384038413842384338443845384638473848384938503851385238533854385538563857385838593860386138623863386438653866386738683869387038713872387338743875387638773878387938803881388238833884388538863887388838893890389138923893389438953896389738983899390039013902390339043905390639073908390939103911391239133914391539163917391839193920392139223923392439253926392739283929393039313932393339343935393639373938393939403941394239433944394539463947394839493950395139523953395439553956395739583959396039613962396339643965396639673968396939703971397239733974397539763977397839793980398139823983398439853986398739883989399039913992399339943995399639973998399940004001400240034004400540064007400840094010401140124013401440154016401740184019402040214022402340244025402640274028402940304031403240334034403540364037403840394040404140424043404440454046404740484049405040514052405340544055405640574058405940604061406240634064406540664067406840694070407140724073407440754076407740784079 |
- <html><head><title>flex</title></head><body>
- <ul>
- </ul><H2>NAME </H2><ul>
- flex - fast lexical analyzer generator
- </ul><H2>SYNOPSIS </H2><ul>
- <b>flex</b>
- <b>[-bcdfhilnpstvwBFILTV78+?</b> <b>-C[aefFmr]</b> <b>-ooutput</b> <b>-Pprefix</b> <b>-Sskeleton]</b>
- <b>[--help</b> <b>--version]</b>
- <i>[filename</i> <i>...]</i>
- </ul><H2>OVERVIEW </H2><ul>
- This manual describes
- <i>flex,</i>
- a tool for generating programs that perform pattern-matching on text. The
- manual includes both tutorial and reference sections:
- <pre>
- <p><br> Description
- <br> a brief overview of the tool
- <br>
- <p><br> Some Simple Examples
- <br>
- <p><br> Format Of The Input File
- <br>
- <p><br> Patterns
- <br> the extended regular expressions used by flex
- <br>
- <p><br> How The Input Is Matched
- <br> the rules for determining what has been matched
- <br>
- <p><br> Actions
- <br> how to specify what to do when a pattern is matched
- <br>
- <p><br> The Generated Scanner
- <br> details regarding the scanner that flex produces;
- <br> how to control the input source
- <br>
- <p><br> Start Conditions
- <br> introducing context into your scanners, and
- <br> managing "mini-scanners"
- <br>
- <p><br> Multiple Input Buffers
- <br> how to manipulate multiple input sources; how to
- <br> scan from strings instead of files
- <br>
- <p><br> End-of-file Rules
- <br> special rules for matching the end of the input
- <br>
- <p><br> Miscellaneous Macros
- <br> a summary of macros available to the actions
- <br>
- <p><br> Values Available To The User
- <br> a summary of values available to the actions
- <br>
- <p><br> Interfacing With Yacc
- <br> connecting flex scanners together with yacc parsers
- <br>
- <p><br> Options
- <br> flex command-line options, and the "%option"
- <br> directive
- <br>
- <p><br> Performance Considerations
- <br> how to make your scanner go as fast as possible
- <br>
- <p><br> Generating C++ Scanners
- <br> the (experimental) facility for generating C++
- <br> scanner classes
- <br>
- <p><br> Incompatibilities With Lex And POSIX
- <br> how flex differs from AT&T lex and the POSIX lex
- <br> standard
- <br>
- <p><br> Diagnostics
- <br> those error messages produced by flex (or scanners
- <br> it generates) whose meanings might not be apparent
- <br>
- <p><br> Files
- <br> files used by flex
- <br>
- <p><br> Deficiencies / Bugs
- <br> known problems with flex
- <br>
- <p><br> See Also
- <br> other documentation, related tools
- <br>
- <p><br> Author
- <br> includes contact information
- <br>
- <p><br></pre>
- </ul><H2>DESCRIPTION </H2><ul>
- <i>flex</i>
- is a tool for generating
- <i>scanners:</i>
- programs which recognize lexical patterns in text.
- <i>flex</i>
- reads
- the given input files, or its standard input if no file names are given,
- for a description of a scanner to generate. The description is in
- the form of pairs
- of regular expressions and C code, called
- <i>rules.</i> <i>flex</i>
- generates as output a C source file,
- <b>lex.yy.c,</b>
- which defines a routine
- <b>yylex().</b>
- This file is compiled and linked with the
- <b>-lfl</b>
- library to produce an executable. When the executable is run,
- it analyzes its input for occurrences
- of the regular expressions. Whenever it finds one, it executes
- the corresponding C code.
- </ul><H2>SOME SIMPLE EXAMPLES </H2><ul>
- <p>
- First some simple examples to get the flavor of how one uses
- <i>flex.</i>
- The following
- <i>flex</i>
- input specifies a scanner which whenever it encounters the string
- "username" will replace it with the user's login name:
- <pre>
- <p><br> %%
- <br> username printf( "%s", getlogin() );
- <br>
- <p><br></pre>
- By default, any text not matched by a
- <i>flex</i>
- scanner
- is copied to the output, so the net effect of this scanner is
- to copy its input file to its output with each occurrence
- of "username" expanded.
- In this input, there is just one rule. "username" is the
- <i>pattern</i>
- and the "printf" is the
- <i>action.</i>
- The "%%" marks the beginning of the rules.
- <p>
- Here's another simple example:
- <pre>
- <p><br> int num_lines = 0, num_chars = 0;
- <br>
- <p><br> %%
- <br> \n ++num_lines; ++num_chars;
- <br> . ++num_chars;
- <br>
- <p><br> %%
- <br> main()
- <br> {
- <br> yylex();
- <br> printf( "# of lines = %d, # of chars = %d\n",
- <br> num_lines, num_chars );
- <br> }
- <br>
- <p><br></pre>
- This scanner counts the number of characters and the number
- of lines in its input (it produces no output other than the
- final report on the counts). The first line
- declares two globals, "num_lines" and "num_chars", which are accessible
- both inside
- <b>yylex()</b>
- and in the
- <b>main()</b>
- routine declared after the second "%%". There are two rules, one
- which matches a newline ("\n") and increments both the line count and
- the character count, and one which matches any character other than
- a newline (indicated by the "." regular expression).
- <p>
- A somewhat more complicated example:
- <pre>
- <p><br> /* scanner for a toy Pascal-like language */
- <br>
- <p><br> %{
- <br> /* need this for the call to atof() below */
- <br> #include &lt;math.h&gt;
- <br> %}
- <br>
- <p><br> DIGIT [0-9]
- <br> ID [a-z][a-z0-9]*
- <br>
- <p><br> %%
- <br>
- <p><br> {DIGIT}+ {
- <br> printf( "An integer: %s (%d)\n", yytext,
- <br> atoi( yytext ) );
- <br> }
- <br>
- <p><br> {DIGIT}+"."{DIGIT}* {
- <br> printf( "A float: %s (%g)\n", yytext,
- <br> atof( yytext ) );
- <br> }
- <br>
- <p><br> if|then|begin|end|procedure|function {
- <br> printf( "A keyword: %s\n", yytext );
- <br> }
- <br>
- <p><br> {ID} printf( "An identifier: %s\n", yytext );
- <br>
- <p><br> "+"|"-"|"*"|"/" printf( "An operator: %s\n", yytext );
- <br>
- <p><br> "{"[^}\n]*"}" /* eat up one-line comments */
- <br>
- <p><br> [ \t\n]+ /* eat up whitespace */
- <br>
- <p><br> . printf( "Unrecognized character: %s\n", yytext );
- <br>
- <p><br> %%
- <br>
- <p><br> main( argc, argv )
- <br> int argc;
- <br> char **argv;
- <br> {
- <br> ++argv, --argc; /* skip over program name */
- <br> if ( argc > 0 )
- <br> yyin = fopen( argv[0], "r" );
- <br> else
- <br> yyin = stdin;
- <br>
- <br> yylex();
- <br> }
- <br>
- <p><br></pre>
- This is the beginning of a simple scanner for a language like
- Pascal. It identifies different types of
- <i>tokens</i>
- and reports on what it has seen.
- <p>
- The details of this example will be explained in the following
- sections.
- </ul><H2>FORMAT OF THE INPUT FILE </H2><ul>
- The
- <i>flex</i>
- input file consists of three sections, separated by a line with just
- <b>%%</b>
- in it:
- <pre>
- <p><br> definitions
- <br> %%
- <br> rules
- <br> %%
- <br> user code
- <br>
- <p><br></pre>
- The
- <i>definitions</i>
- section contains declarations of simple
- <i>name</i>
- definitions to simplify the scanner specification, and declarations of
- <i>start</i> <i>conditions,</i>
- which are explained in a later section.
- <p>
- Name definitions have the form:
- <pre>
- <p><br> name definition
- <br>
- <p><br></pre>
- The "name" is a word beginning with a letter or an underscore ('_')
- followed by zero or more letters, digits, '_', or '-' (dash).
- The definition is taken to begin at the first non-white-space character
- following the name and continuing to the end of the line.
- The definition can subsequently be referred to using "{name}", which
- will expand to "(definition)". For example,
- <pre>
- <p><br> DIGIT [0-9]
- <br> ID [a-z][a-z0-9]*
- <br>
- <p><br></pre>
- defines "DIGIT" to be a regular expression which matches a
- single digit, and
- "ID" to be a regular expression which matches a letter
- followed by zero-or-more letters-or-digits.
- A subsequent reference to
- <pre>
- <p><br> {DIGIT}+"."{DIGIT}*
- <br>
- <p><br></pre>
- is identical to
- <pre>
- <p><br> ([0-9])+"."([0-9])*
- <br>
- <p><br></pre>
- and matches one-or-more digits followed by a '.' followed
- by zero-or-more digits.
- <p>
- The
- <i>rules</i>
- section of the
- <i>flex</i>
- input contains a series of rules of the form:
- <pre>
- <p><br> pattern action
- <br>
- <p><br></pre>
- where the pattern must be unindented and the action must begin
- on the same line.
- <p>
- See below for a further description of patterns and actions.
- <p>
- Finally, the user code section is simply copied to
- <b>lex.yy.c</b>
- verbatim.
- It is used for companion routines which call or are called
- by the scanner. The presence of this section is optional;
- if it is missing, the second
- <b>%%</b>
- in the input file may be skipped, too.
- <p>
- In the definitions and rules sections, any
- <i>indented</i>
- text or text enclosed in
- <b>%{</b>
- and
- <b>%}</b>
- is copied verbatim to the output (with the %{}'s removed).
- The %{}'s must appear unindented on lines by themselves.
- <p>
- In the rules section,
- any indented or %{} text appearing before the
- first rule may be used to declare variables
- which are local to the scanning routine and (after the declarations)
- code which is to be executed whenever the scanning routine is entered.
- Other indented or %{} text in the rule section is still copied to the output,
- but its meaning is not well-defined and it may well cause compile-time
- errors (this feature is present for
- <i>POSIX</i>
- compliance; see below for other such features).
- <p>
- In the definitions section (but not in the rules section),
- an unindented comment (i.e., a line
- beginning with "/*") is also copied verbatim to the output up
- to the next "*/".
- </ul><H2>PATTERNS </H2><ul>
- The patterns in the input are written using an extended set of regular
- expressions. These are:
- <pre>
- <p><br> x match the character 'x'
- <br> . any character (byte) except newline
- <br> [xyz] a "character class"; in this case, the pattern
- <br> matches either an 'x', a 'y', or a 'z'
- <br> [abj-oZ] a "character class" with a range in it; matches
- <br> an 'a', a 'b', any letter from 'j' through 'o',
- <br> or a 'Z'
- <br> [^A-Z] a "negated character class", i.e., any character
- <br> but those in the class. In this case, any
- <br> character EXCEPT an uppercase letter.
- <br> [^A-Z\n] any character EXCEPT an uppercase letter or
- <br> a newline
- <br> r* zero or more r's, where r is any regular expression
- <br> r+ one or more r's
- <br> r? zero or one r's (that is, "an optional r")
- <br> r{2,5} anywhere from two to five r's
- <br> r{2,} two or more r's
- <br> r{4} exactly 4 r's
- <br> {name} the expansion of the "name" definition
- <br> (see above)
- <br> "[xyz]\"foo"
- <br> the literal string: [xyz]"foo
- <br> \X if X is an 'a', 'b', 'f', 'n', 'r', 't', or 'v',
- <br> then the ANSI-C interpretation of \x.
- <br> Otherwise, a literal 'X' (used to escape
- <br> operators such as '*')
- <br> \0 a NUL character (ASCII code 0)
- <br> \123 the character with octal value 123
- <br> \x2a the character with hexadecimal value 2a
- <br> (r) match an r; parentheses are used to override
- <br> precedence (see below)
- <br>
- <p><br>
- <p><br> rs the regular expression r followed by the
- <br> regular expression s; called "concatenation"
- <br>
- <p><br>
- <p><br> r|s either an r or an s
- <br>
- <p><br>
- <p><br> r/s an r but only if it is followed by an s. The
- <br> text matched by s is included when determining
- <br> whether this rule is the "longest match",
- <br> but is then returned to the input before
- <br> the action is executed. So the action only
- <br> sees the text matched by r. This type
- <br> of pattern is called "trailing context".
- <br> (There are some combinations of r/s that flex
- <br> cannot match correctly; see notes in the
- <br> Deficiencies / Bugs section below regarding
- <br> "dangerous trailing context".)
- <br> ^r an r, but only at the beginning of a line (i.e.,
- <br> when just starting to scan, or right after a
- <br> newline has been scanned).
- <br> r$ an r, but only at the end of a line (i.e., just
- <br> before a newline). Equivalent to "r/\n".
- <br>
- <p><br> Note that flex's notion of "newline" is exactly
- <br> whatever the C compiler used to compile flex
- <br> interprets '\n' as; in particular, on some DOS
- <br> systems you must either filter out \r's in the
- <br> input yourself, or explicitly use r/\r\n for "r$".
- <br>
- <p><br>
- <p><br> &lt;s&gt;r an r, but only in start condition s (see
- <br> below for discussion of start conditions)
- <br> &lt;s1,s2,s3&gt;r
- <br> same, but in any of start conditions s1,
- <br> s2, or s3
- <br> &lt;*&gt;r an r in any start condition, even an exclusive one.
- <br>
- <p><br>
- <p><br> &lt;&lt;EOF&gt;&gt; an end-of-file
- <br> &lt;s1,s2&gt;&lt;&lt;EOF&gt;&gt;
- <br> an end-of-file when in start condition s1 or s2
- <br>
- <p><br></pre>
- Note that inside of a character class, all regular expression operators
- lose their special meaning except escape ('\') and the character class
- operators, '-', ']', and, at the beginning of the class, '^'.
- <p>
- The regular expressions listed above are grouped according to
- precedence, from highest precedence at the top to lowest at the bottom.
- Those grouped together have equal precedence. For example,
- <pre>
- <p><br> foo|bar*
- <br>
- <p><br></pre>
- is the same as
- <pre>
- <p><br> (foo)|(ba(r*))
- <br>
- <p><br></pre>
- since the '*' operator has higher precedence than concatenation,
- and concatenation higher than alternation ('|'). This pattern
- therefore matches
- <i>either</i>
- the string "foo"
- <i>or</i>
- the string "ba" followed by zero-or-more r's.
- To match "foo" or zero-or-more "bar"'s, use:
- <pre>
- <p><br> foo|(bar)*
- <br>
- <p><br></pre>
- and to match zero-or-more "foo"'s-or-"bar"'s:
- <pre>
- <p><br> (foo|bar)*
- <br>
- <p><br></pre>
- <p>
- In addition to characters and ranges of characters, character classes
- can also contain character class
- <i>expressions.</i>
- These are expressions enclosed inside
- <b>[:</b>
- and
- <b>:]</b>
- delimiters (which themselves must appear between the '[' and ']' of the
- character class; other elements may occur inside the character class, too).
- The valid expressions are:
- <pre>
- <p><br> [:alnum:] [:alpha:] [:blank:]
- <br> [:cntrl:] [:digit:] [:graph:]
- <br> [:lower:] [:print:] [:punct:]
- <br> [:space:] [:upper:] [:xdigit:]
- <br>
- <p><br></pre>
- These expressions all designate a set of characters equivalent to
- the corresponding standard C
- <b>isXXX</b>
- function. For example,
- <b>[:alnum:]</b>
- designates those characters for which
- <b>isalnum()</b>
- returns true - i.e., any alphabetic or numeric.
- Some systems don't provide
- <b>isblank(),</b>
- so flex defines
- <b>[:blank:]</b>
- as a blank or a tab.
- <p>
- For example, the following character classes are all equivalent:
- <pre>
- <p><br> [[:alnum:]]
- <br> [[:alpha:][:digit:]]
- <br> [[:alpha:]0-9]
- <br> [a-zA-Z0-9]
- <br>
- <p><br></pre>
- If your scanner is case-insensitive (the
- <b>-i</b>
- flag), then
- <b>[:upper:]</b>
- and
- <b>[:lower:]</b>
- are equivalent to
- <b>[:alpha:].</b>
- <p>
- Some notes on patterns:
- <p><dl compact><dt>-<dd>A negated character class such as the example "[^A-Z]"
- above
- <i>will</i> <i>match</i> <i>a</i> <i>newline</i>
- unless "\n" (or an equivalent escape sequence) is one of the
- characters explicitly present in the negated character class
- (e.g., "[^A-Z\n]"). This is unlike how many other regular
- expression tools treat negated character classes, but unfortunately
- the inconsistency is historically entrenched.
- Matching newlines means that a pattern like [^"]* can match the entire
- input unless there's another quote in the input.
- <dt>-<dd>A rule can have at most one instance of trailing context (the '/' operator
- or the '$' operator). The start condition, '^', and "&lt;&lt;EOF&gt;&gt;" patterns
- can only occur at the beginning of a pattern and, like '/' and '$',
- cannot be grouped inside parentheses. A '^' which does not occur at
- the beginning of a rule or a '$' which does not occur at the end of
- a rule loses its special properties and is treated as a normal character.
- <dt><dd>The following are illegal:
- <pre>
- <p><br> foo/bar$
- <br> &lt;sc1&gt;foo&lt;sc2&gt;bar
- <br>
- <p><br></pre>
- Note that the first of these can be written "foo/bar\n".
- <dt><dd>The following will result in '$' or '^' being treated as a normal character:
- <pre>
- <p><br> foo|(bar$)
- <br> foo|^bar
- <br>
- <p><br></pre>
- If what's wanted is a "foo" or a bar-followed-by-a-newline, the following
- could be used (the special '|' action is explained below):
- <pre>
- <p><br> foo |
- <br> bar$ /* action goes here */
- <br>
- <p><br></pre>
- A similar trick will work for matching a foo or a
- bar-at-the-beginning-of-a-line.
- </dl>
- </ul><H2>HOW THE INPUT IS MATCHED </H2><ul>
- When the generated scanner is run, it analyzes its input looking
- for strings which match any of its patterns. If it finds more than
- one match, it takes the one matching the most text (for trailing
- context rules, this includes the length of the trailing part, even
- though it will then be returned to the input). If it finds two
- or more matches of the same length, the
- rule listed first in the
- <i>flex</i>
- input file is chosen.
- <p>
- Once the match is determined, the text corresponding to the match
- (called the
- <i>token)</i>
- is made available in the global character pointer
- <b>yytext,</b>
- and its length in the global integer
- <b>yyleng.</b>
- The
- <i>action</i>
- corresponding to the matched pattern is then executed (a more
- detailed description of actions follows), and then the remaining
- input is scanned for another match.
- <p>
- If no match is found, then the
- <i>default</i> <i>rule</i>
- is executed: the next character in the input is considered matched and
- copied to the standard output. Thus, the simplest legal
- <i>flex</i>
- input is:
- <pre>
- <p><br> %%
- <br>
- <p><br></pre>
- which generates a scanner that simply copies its input (one character
- at a time) to its output.
- <p>
- Note that
- <b>yytext</b>
- can be defined in two different ways: either as a character
- <i>pointer</i>
- or as a character
- <i>array.</i>
- You can control which definition
- <i>flex</i>
- uses by including one of the special directives
- <b>%pointer</b>
- or
- <b>%array</b>
- in the first (definitions) section of your flex input. The default is
- <b>%pointer,</b>
- unless you use the
- <b>-l</b>
- lex compatibility option, in which case
- <b>yytext</b>
- will be an array.
- The advantage of using
- <b>%pointer</b>
- is substantially faster scanning and no buffer overflow when matching
- very large tokens (unless you run out of dynamic memory). The disadvantage
- is that you are restricted in how your actions can modify
- <b>yytext</b>
- (see the next section), and calls to the
- <b>unput()</b>
- function destroy the present contents of
- <b>yytext,</b>
- which can be a considerable porting headache when moving between different
- <i>lex</i>
- versions.
- <p>
- The advantage of
- <b>%array</b>
- is that you can then modify
- <b>yytext</b>
- to your heart's content, and calls to
- <b>unput()</b>
- do not destroy
- <b>yytext</b>
- (see below). Furthermore, existing
- <i>lex</i>
- programs sometimes access
- <b>yytext</b>
- externally using declarations of the form:
- <pre>
- extern char yytext[];
- <br></pre>
- This definition is erroneous when used with
- <b>%pointer,</b>
- but correct for
- <b>%array.</b>
- <p>
- <b>%array</b>
- defines
- <b>yytext</b>
- to be an array of
- <b>YYLMAX</b>
- characters, which defaults to a fairly large value. You can change
- the size by simply #define'ing
- <b>YYLMAX</b>
- to a different value in the first section of your
- <i>flex</i>
- input. As mentioned above, with
- <b>%pointer</b>
- yytext grows dynamically to accommodate large tokens. While this means your
- <b>%pointer</b>
- scanner can accommodate very large tokens (such as matching entire blocks
- of comments), bear in mind that each time the scanner must resize
- <b>yytext</b>
- it also must rescan the entire token from the beginning, so matching such
- tokens can prove slow.
- <b>yytext</b>
- presently does
- <i>not</i>
- dynamically grow if a call to
- <b>unput()</b>
- results in too much text being pushed back; instead, a run-time error results.
- <p>
- Also note that you cannot use
- <b>%array</b>
- with C++ scanner classes
- (the
- <b>c++</b>
- option; see below).
- </ul><H2>ACTIONS </H2><ul>
- Each pattern in a rule has a corresponding action, which can be any
- arbitrary C statement. The pattern ends at the first non-escaped
- whitespace character; the remainder of the line is its action. If the
- action is empty, then when the pattern is matched the input token
- is simply discarded. For example, here is the specification for a program
- which deletes all occurrences of "zap me" from its input:
- <pre>
- <p><br> %%
- <br> "zap me"
- <br>
- <p><br></pre>
- (It will copy all other characters in the input to the output since
- they will be matched by the default rule.)
- <p>
- Here is a program which compresses multiple blanks and tabs down to
- a single blank, and throws away whitespace found at the end of a line:
- <pre>
- <p><br> %%
- <br> [ \t]+ putchar( ' ' );
- <br> [ \t]+$ /* ignore this token */
- <br>
- <p><br></pre>
- <p>
- If the action contains a '{', then the action spans till the balancing '}'
- is found, and the action may cross multiple lines.
- <i>flex</i>
- knows about C strings and comments and won't be fooled by braces found
- within them, but also allows actions to begin with
- <b>%{</b>
- and will consider the action to be all the text up to the next
- <b>%}</b>
- (regardless of ordinary braces inside the action).
- <p>
- An action consisting solely of a vertical bar ('|') means "same as
- the action for the next rule." See below for an illustration.
- <p>
- Actions can include arbitrary C code, including
- <b>return</b>
- statements to return a value to whatever routine called
- <b>yylex().</b>
- Each time
- <b>yylex()</b>
- is called it continues processing tokens from where it last left
- off until it either reaches
- the end of the file or executes a return.
- <p>
- Actions are free to modify
- <b>yytext</b>
- except for lengthening it (adding
- characters to its end--these will overwrite later characters in the
- input stream). This however does not apply when using
- <b>%array</b>
- (see above); in that case,
- <b>yytext</b>
- may be freely modified in any way.
- <p>
- Actions are free to modify
- <b>yyleng</b>
- except they should not do so if the action also includes use of
- <b>yymore()</b>
- (see below).
- <p>
- There are a number of special directives which can be included within
- an action:
- <p><dl compact><dt>-<dd><b>ECHO</b>
- copies yytext to the scanner's output.
- <dt>-<dd><b>BEGIN</b>
- followed by the name of a start condition places the scanner in the
- corresponding start condition (see below).
- <dt>-<dd><b>REJECT</b>
- directs the scanner to proceed on to the "second best" rule which matched the
- input (or a prefix of the input). The rule is chosen as described
- above in "How the Input is Matched", and
- <b>yytext</b>
- and
- <b>yyleng</b>
- set up appropriately.
- It may either be one which matched as much text
- as the originally chosen rule but came later in the
- <i>flex</i>
- input file, or one which matched less text.
- For example, the following will both count the
- words in the input and call the routine special() whenever "frob" is seen:
- <pre>
- <p><br> int word_count = 0;
- <br> %%
- <br>
- <p><br> frob special(); REJECT;
- <br> [^ \t\n]+ ++word_count;
- <br>
- <p><br></pre>
- Without the
- <b>REJECT,</b>
- any "frob"'s in the input would not be counted as words, since the
- scanner normally executes only one action per token.
- Multiple
- <b>REJECT's</b>
- are allowed, each one finding the next best choice to the currently
- active rule. For example, when the following scanner scans the token
- "abcd", it will write "abcdabcaba" to the output:
- <pre>
- <p><br> %%
- <br> a |
- <br> ab |
- <br> abc |
- <br> abcd ECHO; REJECT;
- <br> .|\n /* eat up any unmatched character */
- <br>
- <p><br></pre>
- (The first three rules share the fourth's action since they use
- the special '|' action.)
- <b>REJECT</b>
- is a particularly expensive feature in terms of scanner performance;
- if it is used in
- <i>any</i>
- of the scanner's actions it will slow down
- <i>all</i>
- of the scanner's matching. Furthermore,
- <b>REJECT</b>
- cannot be used with the
- <i>-Cf</i>
- or
- <i>-CF</i>
- options (see below).
- <dt><dd>Note also that unlike the other special actions,
- <b>REJECT</b>
- is a
- <i>branch;</i>
- code immediately following it in the action will
- <i>not</i>
- be executed.
- <dt>-<dd><b>yymore()</b>
- tells the scanner that the next time it matches a rule, the corresponding
- token should be
- <i>appended</i>
- onto the current value of
- <b>yytext</b>
- rather than replacing it. For example, given the input "mega-kludge"
- the following will write "mega-mega-kludge" to the output:
- <pre>
- <p><br> %%
- <br> mega- ECHO; yymore();
- <br> kludge ECHO;
- <br>
- <p><br></pre>
- First "mega-" is matched and echoed to the output. Then "kludge"
- is matched, but the previous "mega-" is still hanging around at the
- beginning of
- <b>yytext</b>
- so the
- <b>ECHO</b>
- for the "kludge" rule will actually write "mega-kludge".
- </dl>
- <p>
- Two notes regarding use of
- <b>yymore().</b>
- First,
- <b>yymore()</b>
- depends on the value of
- <i>yyleng</i>
- correctly reflecting the size of the current token, so you must not
- modify
- <i>yyleng</i>
- if you are using
- <b>yymore().</b>
- Second, the presence of
- <b>yymore()</b>
- in the scanner's action entails a minor performance penalty in the
- scanner's matching speed.
- <p><dl compact><dt>-<dd><b>yyless(n)</b>
- returns all but the first
- <i>n</i>
- characters of the current token back to the input stream, where they
- will be rescanned when the scanner looks for the next match.
- <b>yytext</b>
- and
- <b>yyleng</b>
- are adjusted appropriately (e.g.,
- <b>yyleng</b>
- will now be equal to
- <i>n</i>
- ). For example, on the input "foobar" the following will write out
- "foobarbar":
- <pre>
- <p><br> %%
- <br> foobar ECHO; yyless(3);
- <br> [a-z]+ ECHO;
- <br>
- <p><br></pre>
- An argument of 0 to
- <b>yyless</b>
- will cause the entire current input string to be scanned again. Unless you've
- changed how the scanner will subsequently process its input (using
- <b>BEGIN,</b>
- for example), this will result in an endless loop.
- </dl>
- <p>
- Note that
- <b>yyless</b>
- is a macro and can only be used in the flex input file, not from
- other source files.
- <p><dl compact><dt>-<dd><b>unput(c)</b>
- puts the character
- <i>c</i>
- back onto the input stream. It will be the next character scanned.
- The following action will take the current token and cause it
- to be rescanned enclosed in parentheses.
- <pre>
- <p><br> {
- <br> int i;
- <br> /* Copy yytext because unput() trashes yytext */
- <br> char *yycopy = strdup( yytext );
- <br> unput( ')' );
- <br> for ( i = yyleng - 1; i >= 0; --i )
- <br> unput( yycopy[i] );
- <br> unput( '(' );
- <br> free( yycopy );
- <br> }
- <br>
- <p><br></pre>
- Note that since each
- <b>unput()</b>
- puts the given character back at the
- <i>beginning</i>
- of the input stream, pushing back strings must be done back-to-front.
- </dl>
- <p>
- An important potential problem when using
- <b>unput()</b>
- is that if you are using
- <b>%pointer</b>
- (the default), a call to
- <b>unput()</b>
- <i>destroys</i>
- the contents of
- <i>yytext,</i>
- starting with its rightmost character and devouring one character to
- the left with each call. If you need the value of yytext preserved
- after a call to
- <b>unput()</b>
- (as in the above example),
- you must either first copy it elsewhere, or build your scanner using
- <b>%array</b>
- instead (see How The Input Is Matched).
- <p>
- Finally, note that you cannot put back
- <b>EOF</b>
- to attempt to mark the input stream with an end-of-file.
- <p><dl compact><dt>-<dd><b>input()</b>
- reads the next character from the input stream. For example,
- the following is one way to eat up C comments:
- <pre>
- <p><br> %%
- <br> "/*" {
- <br> register int c;
- <br>
- <p><br> for ( ; ; )
- <br> {
- <br> while ( (c = input()) != '*' &&
- <br> c != EOF )
- <br> ; /* eat up text of comment */
- <br>
- <p><br> if ( c == '*' )
- <br> {
- <br> while ( (c = input()) == '*' )
- <br> ;
- <br> if ( c == '/' )
- <br> break; /* found the end */
- <br> }
- <br>
- <p><br> if ( c == EOF )
- <br> {
- <br> error( "EOF in comment" );
- <br> break;
- <br> }
- <br> }
- <br> }
- <br>
- <p><br></pre>
- (Note that if the scanner is compiled using
- <b>C++,</b>
- then
- <b>input()</b>
- is instead referred to as
- <b>yyinput(),</b>
- in order to avoid a name clash with the
- <b>C++</b>
- stream by the name of
- <i>input.)</i>
- <dt>-<dd><b>YY_FLUSH_BUFFER</b>
- flushes the scanner's internal buffer
- so that the next time the scanner attempts to match a token, it will
- first refill the buffer using
- <b>YY_INPUT</b>
- (see The Generated Scanner, below). This action is a special case
- of the more general
- <b>yy_flush_buffer()</b>
- function, described below in the section Multiple Input Buffers.
- <dt>-<dd><b>yyterminate()</b>
- can be used in lieu of a return statement in an action. It terminates
- the scanner and returns a 0 to the scanner's caller, indicating "all done".
- By default,
- <b>yyterminate()</b>
- is also called when an end-of-file is encountered. It is a macro and
- may be redefined.
- </dl>
- </ul><H2>THE GENERATED SCANNER </H2><ul>
- The output of
- <i>flex</i>
- is the file
- <b>lex.yy.c,</b>
- which contains the scanning routine
- <b>yylex(),</b>
- a number of tables used by it for matching tokens, and a number
- of auxiliary routines and macros. By default,
- <b>yylex()</b>
- is declared as follows:
- <pre>
- <p><br> int yylex()
- <br> {
- <br> ... various definitions and the actions in here ...
- <br> }
- <br>
- <p><br></pre>
- (If your environment supports function prototypes, then it will
- be "int yylex( void )".) This definition may be changed by defining
- the "YY_DECL" macro. For example, you could use:
- <pre>
- <p><br> #define YY_DECL float lexscan( a, b ) float a, b;
- <br>
- <p><br></pre>
- to give the scanning routine the name
- <i>lexscan,</i>
- returning a float, and taking two floats as arguments. Note that
- if you give arguments to the scanning routine using a
- K&R-style/non-prototyped function declaration, you must terminate
- the definition with a semi-colon (;).
- <p>
- Whenever
- <b>yylex()</b>
- is called, it scans tokens from the global input file
- <i>yyin</i>
- (which defaults to stdin). It continues until it either reaches
- an end-of-file (at which point it returns the value 0) or
- one of its actions executes a
- <i>return</i>
- statement.
- <p>
- If the scanner reaches an end-of-file, subsequent calls are undefined
- unless either
- <i>yyin</i>
- is pointed at a new input file (in which case scanning continues from
- that file), or
- <b>yyrestart()</b>
- is called.
- <b>yyrestart()</b>
- takes one argument, a
- <b>FILE</b> <b>*</b>
- pointer (which can be nil, if you've set up
- <b>YY_INPUT</b>
- to scan from a source other than
- <i>yyin),</i>
- and initializes
- <i>yyin</i>
- for scanning from that file. Essentially there is no difference between
- just assigning
- <i>yyin</i>
- to a new input file or using
- <b>yyrestart()</b>
- to do so; the latter is available for compatibility with previous versions
- of
- <i>flex,</i>
- and because it can be used to switch input files in the middle of scanning.
- It can also be used to throw away the current input buffer, by calling
- it with an argument of
- <i>yyin;</i>
- but better is to use
- <b>YY_FLUSH_BUFFER</b>
- (see above).
- Note that
- <b>yyrestart()</b>
- does
- <i>not</i>
- reset the start condition to
- <b>INITIAL</b>
- (see Start Conditions, below).
- <p>
- If
- <b>yylex()</b>
- stops scanning due to executing a
- <i>return</i>
- statement in one of the actions, the scanner may then be called again and it
- will resume scanning where it left off.
- <p>
- By default (and for purposes of efficiency), the scanner uses
- block-reads rather than simple
- <i>getc()</i>
- calls to read characters from
- <i>yyin.</i>
- The nature of how it gets its input can be controlled by defining the
- <b>YY_INPUT</b>
- macro.
- YY_INPUT's calling sequence is "YY_INPUT(buf,result,max_size)". Its
- action is to place up to
- <i>max_size</i>
- characters in the character array
- <i>buf</i>
- and return in the integer variable
- <i>result</i>
- either the
- number of characters read or the constant YY_NULL (0 on Unix systems)
- to indicate EOF. The default YY_INPUT reads from the
- global file-pointer "yyin".
- <p>
- A sample definition of YY_INPUT (in the definitions
- section of the input file):
- <pre>
- <p><br> %{
- <br> #define YY_INPUT(buf,result,max_size) \
- <br> { \
- <br> int c = getchar(); \
- <br> result = (c == EOF) ? YY_NULL : (buf[0] = c, 1); \
- <br> }
- <br> %}
- <br>
- <p><br></pre>
- This definition will change the input processing to occur
- one character at a time.
- <p>
- When the scanner receives an end-of-file indication from YY_INPUT,
- it then checks the
- <b>yywrap()</b>
- function. If
- <b>yywrap()</b>
- returns false (zero), then it is assumed that the
- function has gone ahead and set up
- <i>yyin</i>
- to point to another input file, and scanning continues. If it returns
- true (non-zero), then the scanner terminates, returning 0 to its
- caller. Note that in either case, the start condition remains unchanged;
- it does
- <i>not</i>
- revert to
- <b>INITIAL.</b>
- <p>
- If you do not supply your own version of
- <b>yywrap(),</b>
- then you must either use
- <b>%option</b> <b>noyywrap</b>
- (in which case the scanner behaves as though
- <b>yywrap()</b>
- returned 1), or you must link with
- <b>-lfl</b>
- to obtain the default version of the routine, which always returns 1.
- <p>
- Three routines are available for scanning from in-memory buffers rather
- than files:
- <b>yy_scan_string(),</b> <b>yy_scan_bytes(),</b>
- and
- <b>yy_scan_buffer().</b>
- See the discussion of them below in the section Multiple Input Buffers.
- <p>
- The scanner writes its
- <b>ECHO</b>
- output to the
- <i>yyout</i>
- global (default, stdout), which may be redefined by the user simply
- by assigning it to some other
- <b>FILE</b>
- pointer.
- </ul><H2>START CONDITIONS </H2><ul>
- <i>flex</i>
- provides a mechanism for conditionally activating rules. Any rule
- whose pattern is prefixed with "&lt;sc&gt;" will only be active when
- the scanner is in the start condition named "sc". For example,
- <pre>
- <p><br> <STRING>[^"]* { /* eat up the string body ... */
- <br> ...
- <br> }
- <br>
- <p><br></pre>
- will be active only when the scanner is in the "STRING" start
- condition, and
- <pre>
- <p><br> <INITIAL,STRING,QUOTE>\. { /* handle an escape ... */
- <br> ...
- <br> }
- <br>
- <p><br></pre>
- will be active only when the current start condition is
- either "INITIAL", "STRING", or "QUOTE".
- <p>
- Start conditions
- are declared in the definitions (first) section of the input
- using unindented lines beginning with either
- <b>%s</b>
- or
- <b>%x</b>
- followed by a list of names.
- The former declares
- <i>inclusive</i>
- start conditions, the latter
- <i>exclusive</i>
- start conditions. A start condition is activated using the
- <b>BEGIN</b>
- action. Until the next
- <b>BEGIN</b>
- action is executed, rules with the given start
- condition will be active and
- rules with other start conditions will be inactive.
- If the start condition is
- <i>inclusive,</i>
- then rules with no start conditions at all will also be active.
- If it is
- <i>exclusive,</i>
- then
- <i>only</i>
- rules qualified with the start condition will be active.
- A set of rules contingent on the same exclusive start condition
- describes a scanner which is independent of any of the other rules in the
- <i>flex</i>
- input. Because of this,
- exclusive start conditions make it easy to specify "mini-scanners"
- which scan portions of the input that are syntactically different
- from the rest (e.g., comments).
- <p>
- If the distinction between inclusive and exclusive start conditions
- is still a little vague, here's a simple example illustrating the
- connection between the two. The set of rules:
- <pre>
- <p><br> %s example
- <br> %%
- <br>
- <p><br> <example>foo do_something();
- <br>
- <p><br> bar something_else();
- <br>
- <p><br></pre>
- is equivalent to
- <pre>
- <p><br> %x example
- <br> %%
- <br>
- <p><br> <example>foo do_something();
- <br>
- <p><br> <INITIAL,example>bar something_else();
- <br>
- <p><br></pre>
- Without the
- <b>&lt;INITIAL,example&gt;</b>
- qualifier, the
- <i>bar</i>
- pattern in the second example wouldn't be active (i.e., couldn't match)
- when in start condition
- <b>example.</b>
- If we just used
- <b>&lt;example&gt;</b>
- to qualify
- <i>bar,</i>
- though, then it would only be active in
- <b>example</b>
- and not in
- <b>INITIAL,</b>
- while in the first example it's active in both, because in the first
- example the
- <b>example</b>
- start condition is an
- <i>inclusive</i>
- <b>(%s)</b>
- start condition.
- <p>
- Also note that the special start-condition specifier
- <b>&lt;*&gt;</b>
- matches every start condition. Thus, the above example could also
- have been written:
- <pre>
- <p><br> %x example
- <br> %%
- <br>
- <p><br> <example>foo do_something();
- <br>
- <p><br> <*>bar something_else();
- <br>
- <p><br></pre>
- <p>
- The default rule (to
- <b>ECHO</b>
- any unmatched character) remains active in start conditions. It
- is equivalent to:
- <pre>
- <p><br> <*>.|\n ECHO;
- <br>
- <p><br></pre>
- <p>
- <b>BEGIN(0)</b>
- returns to the original state where only the rules with
- no start conditions are active. This state can also be
- referred to as the start-condition "INITIAL", so
- <b>BEGIN(INITIAL)</b>
- is equivalent to
- <b>BEGIN(0).</b>
- (The parentheses around the start condition name are not required but
- are considered good style.)
- <p>
- <b>BEGIN</b>
- actions can also be given as indented code at the beginning
- of the rules section. For example, the following will cause
- the scanner to enter the "SPECIAL" start condition whenever
- <b>yylex()</b>
- is called and the global variable
- <i>enter_special</i>
- is true:
- <pre>
- <p><br> int enter_special;
- <br>
- <p><br> %x SPECIAL
- <br> %%
- <br> if ( enter_special )
- <br> BEGIN(SPECIAL);
- <br>
- <p><br> <SPECIAL>blahblahblah
- <br> ...more rules follow...
- <br>
- <p><br></pre>
- <p>
- To illustrate the uses of start conditions,
- here is a scanner which provides two different interpretations
- of a string like "123.456". By default it will treat it as
- three tokens, the integer "123", a dot ('.'), and the integer "456".
- But if the string is preceded earlier in the line by the string
- "expect-floats"
- it will treat it as a single token, the floating-point number
- 123.456:
- <pre>
- <p><br> %{
- <br> #include <math.h>
- <br> %}
- <br> %s expect
- <br>
- <p><br> %%
- <br> expect-floats BEGIN(expect);
- <br>
- <p><br> <expect>[0-9]+"."[0-9]+ {
- <br> printf( "found a float, = %f\n",
- <br> atof( yytext ) );
- <br> }
- <br> <expect>\n {
- <br> /* that's the end of the line, so
- <br> * we need another "expect-floats"
- <br> * before we'll recognize any more
- <br> * numbers
- <br> */
- <br> BEGIN(INITIAL);
- <br> }
- <br>
- <p><br> [0-9]+ {
- <br> printf( "found an integer, = %d\n",
- <br> atoi( yytext ) );
- <br> }
- <br>
- <p><br> "." printf( "found a dot\n" );
- <br>
- <p><br></pre>
- Here is a scanner which recognizes (and discards) C comments while
- maintaining a count of the current input line.
- <pre>
- <p><br> %x comment
- <br> %%
- <br> int line_num = 1;
- <br>
- <p><br> "/*" BEGIN(comment);
- <br>
- <p><br> <comment>[^*\n]* /* eat anything that's not a '*' */
- <br> <comment>"*"+[^*/\n]* /* eat up '*'s not followed by '/'s */
- <br> <comment>\n ++line_num;
- <br> <comment>"*"+"/" BEGIN(INITIAL);
- <br>
- <p><br></pre>
- This scanner goes to a bit of trouble to match as much
- text as possible with each rule. In general, when attempting to write
- a high-speed scanner try to match as much as possible in each rule, as
- it's a big win.
- <p>
- Note that start-condition names are really integer values and
- can be stored as such. Thus, the above could be extended in the
- following fashion:
- <pre>
- <p><br> %x comment foo
- <br> %%
- <br> int line_num = 1;
- <br> int comment_caller;
- <br>
- <p><br> "/*" {
- <br> comment_caller = INITIAL;
- <br> BEGIN(comment);
- <br> }
- <br>
- <p><br> ...
- <br>
- <p><br> <foo>"/*" {
- <br> comment_caller = foo;
- <br> BEGIN(comment);
- <br> }
- <br>
- <p><br> <comment>[^*\n]* /* eat anything that's not a '*' */
- <br> <comment>"*"+[^*/\n]* /* eat up '*'s not followed by '/'s */
- <br> <comment>\n ++line_num;
- <br> <comment>"*"+"/" BEGIN(comment_caller);
- <br>
- <p><br></pre>
- Furthermore, you can access the current start condition using
- the integer-valued
- <b>YY_START</b>
- macro. For example, the above assignments to
- <i>comment_caller</i>
- could instead be written
- <pre>
- <p><br> comment_caller = YY_START;
- <br>
- <p><br></pre>
- Flex provides
- <b>YYSTATE</b>
- as an alias for
- <b>YY_START</b>
- (since that is what's used by AT&T
- <i>lex).</i>
- <p>
- Note that start conditions do not have their own name-space; %s's and %x's
- declare names in the same fashion as #define's.
- <p>
- Finally, here's an example of how to match C-style quoted strings using
- exclusive start conditions, including expanded escape sequences (but
- not including checking for a string that's too long):
- <pre>
- <p><br> %x str
- <br>
- <p><br> %%
- <br> char string_buf[MAX_STR_CONST];
- <br> char *string_buf_ptr;
- <br>
- <p><br>
- <p><br> \" string_buf_ptr = string_buf; BEGIN(str);
- <br>
- <p><br> <str>\" { /* saw closing quote - all done */
- <br> BEGIN(INITIAL);
- <br> *string_buf_ptr = '\0';
- <br> /* return string constant token type and
- <br> * value to parser
- <br> */
- <br> }
- <br>
- <p><br> <str>\n {
- <br> /* error - unterminated string constant */
- <br> /* generate error message */
- <br> }
- <br>
- <p><br> <str>\\[0-7]{1,3} {
- <br> /* octal escape sequence */
- <br> int result;
- <br>
- <p><br> (void) sscanf( yytext + 1, "%o", &result );
- <br>
- <p><br> if ( result > 0xff )
- <br> /* error, constant is out-of-bounds */
- <br>
- <p><br> *string_buf_ptr++ = result;
- <br> }
- <br>
- <p><br> <str>\\[0-9]+ {
- <br> /* generate error - bad escape sequence; something
- <br> * like '\48' or '\0777777'
- <br> */
- <br> }
- <br>
- <p><br> <str>\\n *string_buf_ptr++ = '\n';
- <br> <str>\\t *string_buf_ptr++ = '\t';
- <br> <str>\\r *string_buf_ptr++ = '\r';
- <br> <str>\\b *string_buf_ptr++ = '\b';
- <br> <str>\\f *string_buf_ptr++ = '\f';
- <br>
- <p><br> <str>\\(.|\n) *string_buf_ptr++ = yytext[1];
- <br>
- <p><br> <str>[^\\\n\"]+ {
- <br> char *yptr = yytext;
- <br>
- <p><br> while ( *yptr )
- <br> *string_buf_ptr++ = *yptr++;
- <br> }
- <br>
- <p><br></pre>
- <p>
- Often, such as in some of the examples above, you wind up writing a
- whole bunch of rules all preceded by the same start condition(s). Flex
- makes this a little easier and cleaner by introducing a notion of
- start condition
- <i>scope.</i>
- A start condition scope is begun with:
- <pre>
- <p><br> <SCs>{
- <br>
- <p><br></pre>
- where
- <i>SCs</i>
- is a list of one or more start conditions. Inside the start condition
- scope, every rule automatically has the prefix
- <i>&lt;SCs&gt;</i>
- applied to it, until a
- <i>'}'</i>
- which matches the initial
- <i>'{'.</i>
- So, for example,
- <pre>
- <p><br> <ESC>{
- <br> "\\n" return '\n';
- <br> "\\r" return '\r';
- <br> "\\f" return '\f';
- <br> "\\0" return '\0';
- <br> }
- <br>
- <p><br></pre>
- is equivalent to:
- <pre>
- <p><br> <ESC>"\\n" return '\n';
- <br> <ESC>"\\r" return '\r';
- <br> <ESC>"\\f" return '\f';
- <br> <ESC>"\\0" return '\0';
- <br>
- <p><br></pre>
- Start condition scopes may be nested.
- <p>
- Three routines are available for manipulating stacks of start conditions:
- <p><dl compact><dt><b>void</b> <b>yy_push_state(int</b> <b>new_state)</b>
- <dd>pushes the current start condition onto the top of the start condition
- stack and switches to
- <i>new_state</i>
- as though you had used
- <b>BEGIN</b> <b>new_state</b>
- (recall that start condition names are also integers).
- <dt><b>void</b> <b>yy_pop_state()</b>
- <dd>pops the top of the stack and switches to it via
- <b>BEGIN.</b>
- <dt><b>int</b> <b>yy_top_state()</b>
- <dd>returns the top of the stack without altering the stack's contents.
- </dl>
- <p>
- The start condition stack grows dynamically and so has no built-in
- size limitation. If memory is exhausted, program execution aborts.
- <p>
- To use start condition stacks, your scanner must include a
- <b>%option</b> <b>stack</b>
- directive (see Options below).
- </ul><H2>MULTIPLE INPUT BUFFERS </H2><ul>
- Some scanners (such as those which support "include" files)
- require reading from several input streams. As
- <i>flex</i>
- scanners do a large amount of buffering, one cannot control
- where the next input will be read from by simply writing a
- <b>YY_INPUT</b>
- which is sensitive to the scanning context.
- <b>YY_INPUT</b>
- is only called when the scanner reaches the end of its buffer, which
- may be a long time after scanning a statement such as an "include"
- which requires switching the input source.
- <p>
- To negotiate these sorts of problems,
- <i>flex</i>
- provides a mechanism for creating and switching between multiple
- input buffers. An input buffer is created by using:
- <pre>
- <p><br> YY_BUFFER_STATE yy_create_buffer( FILE *file, int size )
- <br>
- <p><br></pre>
- which takes a
- <i>FILE</i>
- pointer and a size and creates a buffer associated with the given
- file and large enough to hold
- <i>size</i>
- characters (when in doubt, use
- <b>YY_BUF_SIZE</b>
- for the size). It returns a
- <b>YY_BUFFER_STATE</b>
- handle, which may then be passed to other routines (see below). The
- <b>YY_BUFFER_STATE</b>
- type is a pointer to an opaque
- <b>struct</b> <b>yy_buffer_state</b>
- structure, so you may safely initialize YY_BUFFER_STATE variables to
- <b>((YY_BUFFER_STATE)</b> <b>0)</b>
- if you wish, and also refer to the opaque structure in order to
- correctly declare input buffers in source files other than that
- of your scanner. Note that the
- <i>FILE</i>
- pointer in the call to
- <b>yy_create_buffer</b>
- is only used as the value of
- <i>yyin</i>
- seen by
- <b>YY_INPUT;</b>
- if you redefine
- <b>YY_INPUT</b>
- so it no longer uses
- <i>yyin,</i>
- then you can safely pass a nil
- <i>FILE</i>
- pointer to
- <b>yy_create_buffer.</b>
- You select a particular buffer to scan from using:
- <pre>
- <p><br> void yy_switch_to_buffer( YY_BUFFER_STATE new_buffer )
- <br>
- <p><br></pre>
- which switches the scanner's input buffer so subsequent tokens will
- come from
- <i>new_buffer.</i>
- Note that
- <b>yy_switch_to_buffer()</b>
- may be used by yywrap() to set things up for continued scanning, instead
- of opening a new file and pointing
- <i>yyin</i>
- at it. Note also that switching input sources via either
- <b>yy_switch_to_buffer()</b>
- or
- <b>yywrap()</b>
- does
- <i>not</i>
- change the start condition.
- <pre>
- <p><br> void yy_delete_buffer( YY_BUFFER_STATE buffer )
- <br>
- <p><br></pre>
- is used to reclaim the storage associated with a buffer.
- You can also clear the current contents of a buffer using:
- <pre>
- <p><br> void yy_flush_buffer( YY_BUFFER_STATE buffer )
- <br>
- <p><br></pre>
- This function discards the buffer's contents,
- so the next time the scanner attempts to match a token from the
- buffer, it will first fill the buffer anew using
- <b>YY_INPUT.</b>
- <p>
- <b>yy_new_buffer()</b>
- is an alias for
- <b>yy_create_buffer(),</b>
- provided for compatibility with the C++ use of
- <i>new</i>
- and
- <i>delete</i>
- for creating and destroying dynamic objects.
- <p>
- Finally, the
- <b>YY_CURRENT_BUFFER</b>
- macro returns a
- <b>YY_BUFFER_STATE</b>
- handle to the current buffer.
- <p>
- Here is an example of using these features for writing a scanner
- which expands include files (the
- <b>&lt;&lt;EOF&gt;&gt;</b>
- feature is discussed below):
- <pre>
- <p><br> /* the "incl" state is used for picking up the name
- <br> * of an include file
- <br> */
- <br> %x incl
- <br>
- <p><br> %{
- <br> #define MAX_INCLUDE_DEPTH 10
- <br> YY_BUFFER_STATE include_stack[MAX_INCLUDE_DEPTH];
- <br> int include_stack_ptr = 0;
- <br> %}
- <br>
- <p><br> %%
- <br> include BEGIN(incl);
- <br>
- <p><br> [a-z]+ ECHO;
- <br> [^a-z\n]*\n? ECHO;
- <br>
- <p><br> &lt;incl&gt;[ \t]* /* eat the whitespace */
- <br> &lt;incl&gt;[^ \t\n]+ { /* got the include file name */
- <br> if ( include_stack_ptr >= MAX_INCLUDE_DEPTH )
- <br> {
- <br> fprintf( stderr, "Includes nested too deeply" );
- <br> exit( 1 );
- <br> }
- <br>
- <p><br> include_stack[include_stack_ptr++] =
- <br> YY_CURRENT_BUFFER;
- <br>
- <p><br> yyin = fopen( yytext, "r" );
- <br>
- <p><br> if ( ! yyin )
- <br> error( ... );
- <br>
- <p><br> yy_switch_to_buffer(
- <br> yy_create_buffer( yyin, YY_BUF_SIZE ) );
- <br>
- <p><br> BEGIN(INITIAL);
- <br> }
- <br>
- <p><br> &lt;&lt;EOF&gt;&gt; {
- <br> if ( --include_stack_ptr < 0 )
- <br> {
- <br> yyterminate();
- <br> }
- <br>
- <p><br> else
- <br> {
- <br> yy_delete_buffer( YY_CURRENT_BUFFER );
- <br> yy_switch_to_buffer(
- <br> include_stack[include_stack_ptr] );
- <br> }
- <br> }
- <br>
- <p><br></pre>
- Three routines are available for setting up input buffers for
- scanning in-memory strings instead of files. All of them create
- a new input buffer for scanning the string, and return a corresponding
- <b>YY_BUFFER_STATE</b>
- handle (which you should delete with
- <b>yy_delete_buffer()</b>
- when done with it). They also switch to the new buffer using
- <b>yy_switch_to_buffer(),</b>
- so the next call to
- <b>yylex()</b>
- will start scanning the string.
- <p><dl compact><dt><b>yy_scan_string(const</b> <b>char</b> <b>*str)</b>
- <dd>scans a NUL-terminated string.
- <dt><b>yy_scan_bytes(const</b> <b>char</b> <b>*bytes,</b> <b>int</b> <b>len)</b>
- <dd>scans
- <i>len</i>
- bytes (including possibly NUL's)
- starting at location
- <i>bytes.</i>
- </dl>
- <p>
- Note that both of these functions create and scan a
- <i>copy</i>
- of the string or bytes. (This may be desirable, since
- <b>yylex()</b>
- modifies the contents of the buffer it is scanning.) You can avoid the
- copy by using:
- <p><dl compact><dt><b>yy_scan_buffer(char</b> <b>*base,</b> <b>yy_size_t</b> <b>size)</b>
- <dd>which scans in place the buffer starting at
- <i>base,</i>
- consisting of
- <i>size</i>
- bytes, the last two bytes of which
- <i>must</i>
- be
- <b>YY_END_OF_BUFFER_CHAR</b>
- (ASCII NUL).
- These last two bytes are not scanned; thus, scanning
- consists of
- <b>base[0]</b>
- through
- <b>base[size-2],</b>
- inclusive.
- <dt><dd>If you fail to set up
- <i>base</i>
- in this manner (i.e., forget the final two
- <b>YY_END_OF_BUFFER_CHAR</b>
- bytes), then
- <b>yy_scan_buffer()</b>
- returns a nil pointer instead of creating a new input buffer.
- <dt><dd>The type
- <b>yy_size_t</b>
- is an integral type to which you can cast an integer expression
- reflecting the size of the buffer.
- </dl>
- </ul><H2>END-OF-FILE RULES </H2><ul>
- The special rule "&lt;&lt;EOF&gt;&gt;" indicates
- actions which are to be taken when an end-of-file is
- encountered and yywrap() returns non-zero (i.e., indicates
- no further files to process). The action must finish
- by doing one of four things:
- <p><dl compact><dt>-<dd>assigning
- <i>yyin</i>
- to a new input file (in previous versions of flex, after doing the
- assignment you had to call the special action
- <b>YY_NEW_FILE;</b>
- this is no longer necessary);
- <dt>-<dd>executing a
- <i>return</i>
- statement;
- <dt>-<dd>executing the special
- <b>yyterminate()</b>
- action;
- <dt>-<dd>or, switching to a new buffer using
- <b>yy_switch_to_buffer()</b>
- as shown in the example above.
- </dl>
- <p>
- &lt;&lt;EOF&gt;&gt; rules may not be used with other
- patterns; they may only be qualified with a list of start
- conditions. If an unqualified &lt;&lt;EOF&gt;&gt; rule is given, it
- applies to
- <i>all</i>
- start conditions which do not already have &lt;&lt;EOF&gt;&gt; actions. To
- specify an &lt;&lt;EOF&gt;&gt; rule for only the initial start condition, use
- <pre>
- <p><br> &lt;INITIAL&gt;&lt;&lt;EOF&gt;&gt;
- <br>
- <p><br></pre>
- <p>
- These rules are useful for catching things like unclosed comments.
- An example:
- <pre>
- <p><br> %x quote
- <br> %%
- <br>
- <p><br> ...other rules for dealing with quotes...
- <br>
- <p><br> &lt;quote&gt;&lt;&lt;EOF&gt;&gt; {
- <br> error( "unterminated quote" );
- <br> yyterminate();
- <br> }
- <br> &lt;&lt;EOF&gt;&gt; {
- <br> if ( *++filelist )
- <br> yyin = fopen( *filelist, "r" );
- <br> else
- <br> yyterminate();
- <br> }
- <br>
- <p><br></pre>
- </ul><H2>MISCELLANEOUS MACROS </H2><ul>
- The macro
- <b>YY_USER_ACTION</b>
- can be defined to provide an action
- which is always executed prior to the matched rule's action. For example,
- it could be #define'd to call a routine to convert yytext to lower-case.
- When
- <b>YY_USER_ACTION</b>
- is invoked, the variable
- <i>yy_act</i>
- gives the number of the matched rule (rules are numbered starting with 1).
- Suppose you want to profile how often each of your rules is matched. The
- following would do the trick:
- <pre>
- <p><br> #define YY_USER_ACTION ++ctr[yy_act]
- <br>
- <p><br></pre>
- where
- <i>ctr</i>
- is an array to hold the counts for the different rules. Note that
- the macro
- <b>YY_NUM_RULES</b>
- gives the total number of rules (including the default rule, even if
- you use
- <b>-s),</b>
- so a correct declaration for
- <i>ctr</i>
- is:
- <pre>
- <p><br> int ctr[YY_NUM_RULES];
- <br>
- <p><br></pre>
- <p>
- The macro
- <b>YY_USER_INIT</b>
- may be defined to provide an action which is always executed before
- the first scan (and before the scanner's internal initializations are done).
- For example, it could be used to call a routine to read
- in a data table or open a logging file.
- <p>
- The macro
- <b>yy_set_interactive(is_interactive)</b>
- can be used to control whether the current buffer is considered
- <i>interactive.</i>
- An interactive buffer is processed more slowly,
- but must be used when the scanner's input source is indeed
- interactive to avoid problems due to waiting to fill buffers
- (see the discussion of the
- <b>-I</b>
- flag below). A non-zero value
- in the macro invocation marks the buffer as interactive, a zero
- value as non-interactive. Note that use of this macro overrides
- <b>%option</b> <b>always-interactive</b>
- or
- <b>%option</b> <b>never-interactive</b>
- (see Options below).
- <b>yy_set_interactive()</b>
- must be invoked prior to beginning to scan the buffer that is
- (or is not) to be considered interactive.
- <p>
- The macro
- <b>yy_set_bol(at_bol)</b>
- can be used to control whether the current buffer's scanning
- context for the next token match is done as though at the
- beginning of a line. A non-zero macro argument makes rules anchored with
- '^' active, while a zero argument makes '^' rules inactive.
- <p>
- The macro
- <b>YY_AT_BOL()</b>
- returns true if the next token scanned from the current buffer
- will have '^' rules active, false otherwise.
- <p>
- In the generated scanner, the actions are all gathered in one large
- switch statement and separated using
- <b>YY_BREAK,</b>
- which may be redefined. By default, it is simply a "break", to separate
- each rule's action from the following rule's.
- Redefining
- <b>YY_BREAK</b>
- allows, for example, C++ users to
- #define YY_BREAK to do nothing (while being very careful that every
- rule ends with a "break" or a "return"!) to avoid suffering from
- unreachable statement warnings where, because a rule's action ends with
- "return", the
- <b>YY_BREAK</b>
- is inaccessible.
- </ul><H2>VALUES AVAILABLE TO THE USER </H2><ul>
- This section summarizes the various values available to the user
- in the rule actions.
- <p><dl compact><dt>-<dd><b>char</b> <b>*yytext</b>
- holds the text of the current token. It may be modified but not lengthened
- (you cannot append characters to the end).
- <dt><dd>If the special directive
- <b>%array</b>
- appears in the first section of the scanner description, then
- <b>yytext</b>
- is instead declared
- <b>char</b> <b>yytext[YYLMAX],</b>
- where
- <b>YYLMAX</b>
- is a macro definition that you can redefine in the first section
- if you don't like the default value (generally 8KB). Using
- <b>%array</b>
- results in somewhat slower scanners, but the value of
- <b>yytext</b>
- becomes immune to calls to
- <i>input()</i>
- and
- <i>unput(),</i>
- which potentially destroy its value when
- <b>yytext</b>
- is a character pointer. The opposite of
- <b>%array</b>
- is
- <b>%pointer,</b>
- which is the default.
- <dt><dd>You cannot use
- <b>%array</b>
- when generating C++ scanner classes
- (the
- <b>-+</b>
- flag).
- <dt>-<dd><b>int</b> <b>yyleng</b>
- holds the length of the current token.
- <dt>-<dd><b>FILE</b> <b>*yyin</b>
- is the file which by default
- <i>flex</i>
- reads from. It may be redefined but doing so only makes sense before
- scanning begins or after an EOF has been encountered. Changing it in
- the midst of scanning will have unexpected results since
- <i>flex</i>
- buffers its input; use
- <b>yyrestart()</b>
- instead.
- Once scanning terminates because an end-of-file
- has been seen, you can point
- <i>yyin</i>
- at the new input file and then call the scanner again to continue scanning.
- <dt>-<dd><b>void</b> <b>yyrestart(</b> <b>FILE</b> <b>*new_file</b> <b>)</b>
- may be called to point
- <i>yyin</i>
- at the new input file. The switch-over to the new file is immediate
- (any previously buffered-up input is lost). Note that calling
- <b>yyrestart()</b>
- with
- <i>yyin</i>
- as an argument thus throws away the current input buffer and continues
- scanning the same input file.
- <dt>-<dd><b>FILE</b> <b>*yyout</b>
- is the file to which
- <b>ECHO</b>
- actions are done. It can be reassigned by the user.
- <dt>-<dd><b>YY_CURRENT_BUFFER</b>
- returns a
- <b>YY_BUFFER_STATE</b>
- handle to the current buffer.
- <dt>-<dd><b>YY_START</b>
- returns an integer value corresponding to the current start
- condition. You can subsequently use this value with
- <b>BEGIN</b>
- to return to that start condition.
- </dl>
- </ul><H2>INTERFACING WITH YACC </H2><ul>
- One of the main uses of
- <i>flex</i>
- is as a companion to the
- <i>yacc</i>
- parser-generator.
- <i>yacc</i>
- parsers expect to call a routine named
- <b>yylex()</b>
- to find the next input token. The routine is supposed to
- return the type of the next token as well as putting any associated
- value in the global
- <b>yylval.</b>
- To use
- <i>flex</i>
- with
- <i>yacc,</i>
- one specifies the
- <b>-d</b>
- option to
- <i>yacc</i>
- to instruct it to generate the file
- <b>y.tab.h</b>
- containing definitions of all the
- <b>%tokens</b>
- appearing in the
- <i>yacc</i>
- input. This file is then included in the
- <i>flex</i>
- scanner. For example, if one of the tokens is "TOK_NUMBER",
- part of the scanner might look like:
- <pre>
- <p><br> %{
- <br> #include "y.tab.h"
- <br> %}
- <br>
- <p><br> %%
- <br>
- <p><br> [0-9]+ yylval = atoi( yytext ); return TOK_NUMBER;
- <br>
- <p><br></pre>
- </ul><H2>OPTIONS </H2><ul>
- <i>flex</i>
- has the following options:
- <p><dl compact><dt><b>-b</b>
- <dd>Generate backing-up information to
- <i>lex.backup.</i>
- This is a list of scanner states which require backing up
- and the input characters on which they do so. By adding rules one
- can remove backing-up states. If
- <i>all</i>
- backing-up states are eliminated and
- <b>-Cf</b>
- or
- <b>-CF</b>
- is used, the generated scanner will run faster (see the
- <b>-p</b>
- flag). Only users who wish to squeeze every last cycle out of their
- scanners need worry about this option. (See the section on Performance
- Considerations below.)
- <dt><b>-c</b>
- <dd>is a do-nothing, deprecated option included for POSIX compliance.
- <dt><b>-d</b>
- <dd>makes the generated scanner run in
- <i>debug</i>
- mode. Whenever a pattern is recognized and the global
- <b>yy_flex_debug</b>
- is non-zero (which is the default),
- the scanner will write to
- <i>stderr</i>
- a line of the form:
- <pre>
- <p><br> --accepting rule at line 53 ("the matched text")
- <br>
- <p><br></pre>
- The line number refers to the location of the rule in the file
- defining the scanner (i.e., the file that was fed to flex). Messages
- are also generated when the scanner backs up, accepts the
- default rule, reaches the end of its input buffer (or encounters
- a NUL; at this point, the two look the same as far as the scanner's concerned),
- or reaches an end-of-file.
- <dt><b>-f</b>
- <dd>specifies
- <i>fast</i> <i>scanner.</i>
- No table compression is done and stdio is bypassed.
- The result is large but fast. This option is equivalent to
- <b>-Cfr</b>
- (see below).
- <dt><b>-h</b>
- <dd>generates a "help" summary of
- <i>flex's</i>
- options to
- <i>stdout</i>
- and then exits.
- <b>-?</b>
- and
- <b>--help</b>
- are synonyms for
- <b>-h.</b>
- <dt><b>-i</b>
- <dd>instructs
- <i>flex</i>
- to generate a
- <i>case-insensitive</i>
- scanner. The case of letters given in the
- <i>flex</i>
- input patterns will
- be ignored, and tokens in the input will be matched regardless of case. The
- matched text given in
- <i>yytext</i>
- will have the preserved case (i.e., it will not be folded).
- <dt><b>-l</b>
- <dd>turns on maximum compatibility with the original AT&T
- <i>lex</i>
- implementation. Note that this does not mean
- <i>full</i>
- compatibility. Use of this option costs a considerable amount of
- performance, and it cannot be used with the
- <b>-+,</b> <b>-f,</b> <b>-F,</b> <b>-Cf,</b>
- or
- <b>-CF</b>
- options. For details on the compatibilities it provides, see the section
- "Incompatibilities With Lex And POSIX" below. This option also results
- in the name
- <b>YY_FLEX_LEX_COMPAT</b>
- being #define'd in the generated scanner.
- <dt><b>-n</b>
- <dd>is another do-nothing, deprecated option included only for
- POSIX compliance.
- <dt><b>-p</b>
- <dd>generates a performance report to stderr. The report
- consists of comments regarding features of the
- <i>flex</i>
- input file which will cause a serious loss of performance in the resulting
- scanner. If you give the flag twice, you will also get comments regarding
- features that lead to minor performance losses.
- <dt><dd>Note that the use of
- <b>REJECT,</b>
- <b>%option</b> <b>yylineno,</b>
- and variable trailing context (see the Deficiencies / Bugs section below)
- entails a substantial performance penalty; use of
- <i>yymore(),</i>
- the
- <b>^</b>
- operator,
- and the
- <b>-I</b>
- flag entail minor performance penalties.
- <dt><b>-s</b>
- <dd>causes the
- <i>default</i> <i>rule</i>
- (that unmatched scanner input is echoed to
- <i>stdout)</i>
- to be suppressed. If the scanner encounters input that does not
- match any of its rules, it aborts with an error. This option is
- useful for finding holes in a scanner's rule set.
- <dt><b>-t</b>
- <dd>instructs
- <i>flex</i>
- to write the scanner it generates to standard output instead
- of
- <b>lex.yy.c.</b>
- <dt><b>-v</b>
- <dd>specifies that
- <i>flex</i>
- should write to
- <i>stderr</i>
- a summary of statistics regarding the scanner it generates.
- Most of the statistics are meaningless to the casual
- <i>flex</i>
- user, but the first line identifies the version of
- <i>flex</i>
- (same as reported by
- <b>-V),</b>
- and the next line the flags used when generating the scanner, including
- those that are on by default.
- <dt><b>-w</b>
- <dd>suppresses warning messages.
- <dt><b>-B</b>
- <dd>instructs
- <i>flex</i>
- to generate a
- <i>batch</i>
- scanner, the opposite of
- <i>interactive</i>
- scanners generated by
- <b>-I</b>
- (see below). In general, you use
- <b>-B</b>
- when you are
- <i>certain</i>
- that your scanner will never be used interactively, and you want to
- squeeze a
- <i>little</i>
- more performance out of it. If your goal is instead to squeeze out a
- <i>lot</i>
- more performance, you should be using the
- <b>-Cf</b>
- or
- <b>-CF</b>
- options (discussed below), which turn on
- <b>-B</b>
- automatically anyway.
- <dt><b>-F</b>
- <dd>specifies that the
- fast
- scanner table representation should be used (and stdio
- bypassed). This representation is
- about as fast as the full table representation
- <b>(-f),</b>
- and for some sets of patterns will be considerably smaller (and for
- others, larger). In general, if the pattern set contains both "keywords"
- and a catch-all, "identifier" rule, such as in the set:
- <pre>
- <p><br> "case" return TOK_CASE;
- <br> "switch" return TOK_SWITCH;
- <br> ...
- <br> "default" return TOK_DEFAULT;
- <br> [a-z]+ return TOK_ID;
- <br>
- <p><br></pre>
- then you're better off using the full table representation. If only
- the "identifier" rule is present and you then use a hash table or some such
- to detect the keywords, you're better off using
- <b>-F.</b>
- <dt><dd>This option is equivalent to
- <b>-CFr</b>
- (see below). It cannot be used with
- <b>-+.</b>
- <dt><b>-I</b>
- <dd>instructs
- <i>flex</i>
- to generate an
- <i>interactive</i>
- scanner. An interactive scanner is one that only looks ahead to decide
- what token has been matched if it absolutely must. It turns out that
- always looking one extra character ahead, even if the scanner has already
- seen enough text to disambiguate the current token, is a bit faster than
- only looking ahead when necessary. But scanners that always look ahead
- give dreadful interactive performance; for example, when a user types
- a newline, it is not recognized as a newline token until they enter
- <i>another</i>
- token, which often means typing in another whole line.
- <dt><dd><i>Flex</i>
- scanners default to
- <i>interactive</i>
- unless you use the
- <b>-Cf</b>
- or
- <b>-CF</b>
- table-compression options (see below). That's because if you're looking
- for high-performance you should be using one of these options, so if you
- didn't,
- <i>flex</i>
- assumes you'd rather trade off a bit of run-time performance for intuitive
- interactive behavior. Note also that you
- <i>cannot</i>
- use
- <b>-I</b>
- in conjunction with
- <b>-Cf</b>
- or
- <b>-CF.</b>
- Thus, this option is not really needed; it is on by default for all those
- cases in which it is allowed.
- <dt><dd>You can force a scanner to
- <i>not</i>
- be interactive by using
- <b>-B</b>
- (see above).
- <dt><b>-L</b>
- <dd>instructs
- <i>flex</i>
- not to generate
- <b>#line</b>
- directives. Without this option,
- <i>flex</i>
- peppers the generated scanner
- with #line directives so error messages in the actions will be correctly
- located with respect to either the original
- <i>flex</i>
- input file (if the errors are due to code in the input file), or
- <b>lex.yy.c</b>
- (if the errors are
- <i>flex's</i>
- fault -- you should report these sorts of errors to the email address
- given below).
- <dt><b>-T</b>
- <dd>makes
- <i>flex</i>
- run in
- <i>trace</i>
- mode. It will generate a lot of messages to
- <i>stderr</i>
- concerning
- the form of the input and the resultant non-deterministic and deterministic
- finite automata. This option is mostly for use in maintaining
- <i>flex.</i>
- <dt><b>-V</b>
- <dd>prints the version number to
- <i>stdout</i>
- and exits.
- <b>--version</b>
- is a synonym for
- <b>-V.</b>
- <dt><b>-7</b>
- <dd>instructs
- <i>flex</i>
- to generate a 7-bit scanner, i.e., one which can only recognize 7-bit
- characters in its input. The advantage of using
- <b>-7</b>
- is that the scanner's tables can be up to half the size of those generated
- using the
- <b>-8</b>
- option (see below). The disadvantage is that such scanners often hang
- or crash if their input contains an 8-bit character.
- <dt><dd>Note, however, that unless you generate your scanner using the
- <b>-Cf</b>
- or
- <b>-CF</b>
- table compression options, use of
- <b>-7</b>
- will save only a small amount of table space, and make your scanner
- considerably less portable.
- <i>Flex's</i>
- default behavior is to generate an 8-bit scanner unless you use the
- <b>-Cf</b>
- or
- <b>-CF,</b>
- in which case
- <i>flex</i>
- defaults to generating 7-bit scanners unless your site was always
- configured to generate 8-bit scanners (as will often be the case
- with non-USA sites). You can tell whether flex generated a 7-bit
- or an 8-bit scanner by inspecting the flag summary in the
- <b>-v</b>
- output as described above.
- <dt><dd>Note that if you use
- <b>-Cfe</b>
- or
- <b>-CFe</b>
- (those table compression options, but also using equivalence classes as
- discussed below), flex still defaults to generating an 8-bit
- scanner, since usually with these compression options full 8-bit tables
- are not much more expensive than 7-bit tables.
- <dt><b>-8</b>
- <dd>instructs
- <i>flex</i>
- to generate an 8-bit scanner, i.e., one which can recognize 8-bit
- characters. This flag is only needed for scanners generated using
- <b>-Cf</b>
- or
- <b>-CF,</b>
- as otherwise flex defaults to generating an 8-bit scanner anyway.
- <dt><dd>See the discussion of
- <b>-7</b>
- above for flex's default behavior and the tradeoffs between 7-bit
- and 8-bit scanners.
- <dt><b>-+</b>
- <dd>specifies that you want flex to generate a C++
- scanner class. See the section on Generating C++ Scanners below for
- details.
- <dt><b>-C[aefFmr]</b>
- <dd>controls the degree of table compression and, more generally, trade-offs
- between small scanners and fast scanners.
- <dt><dd><b>-Ca</b>
- ("align") instructs flex to trade off larger tables in the
- generated scanner for faster performance because the elements of
- the tables are better aligned for memory access and computation. On some
- RISC architectures, fetching and manipulating longwords is more efficient
- than with smaller-sized units such as shortwords. This option can
- double the size of the tables used by your scanner.
- <dt><dd><b>-Ce</b>
- directs
- <i>flex</i>
- to construct
- <i>equivalence</i> <i>classes,</i>
- i.e., sets of characters
- which have identical lexical properties (for example, if the only
- appearance of digits in the
- <i>flex</i>
- input is in the character class
- "[0-9]" then the digits '0', '1', ..., '9' will all be put
- in the same equivalence class). Equivalence classes usually give
- dramatic reductions in the final table/object file sizes (typically
- a factor of 2-5) and are pretty cheap performance-wise (one array
- look-up per character scanned).
- <dt><dd><b>-Cf</b>
- specifies that the
- <i>full</i>
- scanner tables should be generated -
- <i>flex</i>
- should not compress the
- tables by taking advantage of similar transition functions for
- different states.
- <dt><dd><b>-CF</b>
- specifies that the alternate fast scanner representation (described
- above under the
- <b>-F</b>
- flag)
- should be used. This option cannot be used with
- <b>-+.</b>
- <dt><dd><b>-Cm</b>
- directs
- <i>flex</i>
- to construct
- <i>meta-equivalence</i> <i>classes,</i>
- which are sets of equivalence classes (or characters, if equivalence
- classes are not being used) that are commonly used together. Meta-equivalence
- classes are often a big win when using compressed tables, but they
- have a moderate performance impact (one or two "if" tests and one
- array look-up per character scanned).
- <dt><dd><b>-Cr</b>
- causes the generated scanner to
- <i>bypass</i>
- use of the standard I/O library (stdio) for input. Instead of calling
- <b>fread()</b>
- or
- <b>getc(),</b>
- the scanner will use the
- <b>read()</b>
- system call, resulting in a performance gain which varies from system
- to system, but in general is probably negligible unless you are also using
- <b>-Cf</b>
- or
- <b>-CF.</b>
- Using
- <b>-Cr</b>
- can cause strange behavior if, for example, you read from
- <i>yyin</i>
- using stdio prior to calling the scanner (because the scanner will miss
- whatever text your previous reads left in the stdio input buffer).
- <dt><dd><b>-Cr</b>
- has no effect if you define
- <b>YY_INPUT</b>
- (see The Generated Scanner above).
- <dt><dd>A lone
- <b>-C</b>
- specifies that the scanner tables should be compressed but neither
- equivalence classes nor meta-equivalence classes should be used.
- <dt><dd>The options
- <b>-Cf</b>
- or
- <b>-CF</b>
- and
- <b>-Cm</b>
- do not make sense together - there is no opportunity for meta-equivalence
- classes if the table is not being compressed. Otherwise the options
- may be freely mixed, and are cumulative.
- <dt><dd>The default setting is
- <b>-Cem,</b>
- which specifies that
- <i>flex</i>
- should generate equivalence classes
- and meta-equivalence classes. This setting provides the highest
- degree of table compression. You can trade off
- faster-executing scanners at the cost of larger tables with
- the following generally being true:
- <pre>
- <p><br> slowest & smallest
- <br> -Cem
- <br> -Cm
- <br> -Ce
- <br> -C
- <br> -C{f,F}e
- <br> -C{f,F}
- <br> -C{f,F}a
- <br> fastest & largest
- <br>
- <p><br></pre>
- Note that scanners with the smallest tables are usually generated and
- compiled the quickest, so
- during development you will usually want to use the default, maximal
- compression.
- <dt><dd><b>-Cfe</b>
- is often a good compromise between speed and size for production
- scanners.
- <dt><b>-ooutput</b>
- <dd>directs flex to write the scanner to the file
- <b>output</b>
- instead of
- <b>lex.yy.c.</b>
- If you combine
- <b>-o</b>
- with the
- <b>-t</b>
- option, then the scanner is written to
- <i>stdout</i>
- but its
- <b>#line</b>
- directives (see the
- <b>-L</b>
- option above) refer to the file
- <b>output.</b>
- <dt><b>-Pprefix</b>
- <dd>changes the default
- <i>yy</i>
- prefix used by
- <i>flex</i>
- for all globally-visible variable and function names to instead be
- <i>prefix.</i>
- For example,
- <b>-Pfoo</b>
- changes the name of
- <b>yytext</b>
- to
- <b>footext.</b>
- It also changes the name of the default output file from
- <b>lex.yy.c</b>
- to
- <b>lex.foo.c.</b>
- Here are all of the names affected:
- <pre>
- <p><br> yy_create_buffer
- <br> yy_delete_buffer
- <br> yy_flex_debug
- <br> yy_init_buffer
- <br> yy_flush_buffer
- <br> yy_load_buffer_state
- <br> yy_switch_to_buffer
- <br> yyin
- <br> yyleng
- <br> yylex
- <br> yylineno
- <br> yyout
- <br> yyrestart
- <br> yytext
- <br> yywrap
- <br>
- <p><br></pre>
- (If you are using a C++ scanner, then only
- <b>yywrap</b>
- and
- <b>yyFlexLexer</b>
- are affected.)
- Within your scanner itself, you can still refer to the global variables
- and functions using either version of their name; but externally, they
- have the modified name.
- <dt><dd>This option lets you easily link together multiple
- <i>flex</i>
- programs into the same executable. Note, though, that using this
- option also renames
- <b>yywrap(),</b>
- so you now
- <i>must</i>
- either
- provide your own (appropriately-named) version of the routine for your
- scanner, or use
- <b>%option</b> <b>noyywrap,</b>
- as linking with
- <b>-lfl</b>
- no longer provides one for you by default.
- <dt><b>-Sskeleton_file</b>
- <dd>overrides the default skeleton file from which
- <i>flex</i>
- constructs its scanners. You'll never need this option unless you are doing
- <i>flex</i>
- maintenance or development.
- </dl>
- <p>
- <i>flex</i>
- also provides a mechanism for controlling options within the
- scanner specification itself, rather than from the flex command-line.
- This is done by including
- <b>%option</b>
- directives in the first section of the scanner specification.
- You can specify multiple options with a single
- <b>%option</b>
- directive, and multiple directives in the first section of your flex input
- file. Most
- options are given simply as names, optionally preceded by the
- word "no" (with no intervening whitespace) to negate their meaning.
- A number are equivalent to flex flags or their negation:
- <pre>
- <p><br> 7bit -7 option
- <br> 8bit -8 option
- <br> align -Ca option
- <br> backup -b option
- <br> batch -B option
- <br> c++ -+ option
- <br>
- <p><br> caseful or
- <br> case-sensitive opposite of -i (default)
- <br>
- <p><br> case-insensitive or
- <br> caseless -i option
- <br>
- <p><br> debug -d option
- <br> default opposite of -s option
- <br> ecs -Ce option
- <br> fast -F option
- <br> full -f option
- <br> interactive -I option
- <br> lex-compat -l option
- <br> meta-ecs -Cm option
- <br> perf-report -p option
- <br> read -Cr option
- <br> stdout -t option
- <br> verbose -v option
- <br> warn opposite of -w option
- <br> (use "%option nowarn" for -w)
- <br>
- <p><br> array equivalent to "%array"
- <br> pointer equivalent to "%pointer" (default)
- <br>
- <p><br></pre>
- Some
- <b>%option's</b>
- provide features otherwise not available:
- <p><dl compact><dt><b>always-interactive</b>
- <dd>instructs flex to generate a scanner which always considers its input
- "interactive". Normally, on each new input file the scanner calls
- <b>isatty()</b>
- in an attempt to determine whether
- the scanner's input source is interactive and thus should be read a
- character at a time. When this option is used, however, then no
- such call is made.
- <dt><b>main</b>
- <dd>directs flex to provide a default
- <b>main()</b>
- program for the scanner, which simply calls
- <b>yylex().</b>
- This option implies
- <b>noyywrap</b>
- (see below).
- <dt><b>never-interactive</b>
- <dd>instructs flex to generate a scanner which never considers its input
- "interactive" (again, no call made to
- <b>isatty()).</b>
- This is the opposite of
- <b>always-interactive.</b>
- <dt><b>stack</b>
- <dd>enables the use of start condition stacks (see Start Conditions above).
- <dt><b>stdinit</b>
- <dd>if unset (i.e.,
- <b>%option</b> <b>nostdinit)</b>
- initializes
- <i>yyin</i>
- and
- <i>yyout</i>
- to nil
- <i>FILE</i>
- pointers, instead of
- <i>stdin</i>
- and
- <i>stdout.</i>
- <dt><b>yylineno</b>
- <dd>directs
- <i>flex</i>
- to generate a scanner that maintains the number of the current line
- read from its input in the global variable
- <b>yylineno.</b>
- This option is implied by
- <b>%option</b> <b>lex-compat.</b>
- <dt><b>yywrap</b>
- <dd>if unset (i.e.,
- <b>%option</b> <b>noyywrap),</b>
- makes the scanner not call
- <b>yywrap()</b>
- upon an end-of-file, but simply assume that there are no more
- files to scan (until the user points
- <i>yyin</i>
- at a new file and calls
- <b>yylex()</b>
- again).
- </dl>
- <p>
- <i>flex</i>
- scans your rule actions to determine whether you use the
- <b>REJECT</b>
- or
- <b>yymore()</b>
- features. The
- <b>reject</b>
- and
- <b>yymore</b>
- options are available to override its decision as to whether you use the
- options, either by setting them (e.g.,
- <b>%option</b> <b>reject)</b>
- to indicate the feature is indeed used, or
- unsetting them to indicate it actually is not used
- (e.g.,
- <b>%option</b> <b>noyymore).</b>
- <p>
- Three options take string-delimited values, offset with '=':
- <pre>
- <p><br> %option outfile="ABC"
- <br>
- <p><br></pre>
- is equivalent to
- <b>-oABC,</b>
- and
- <pre>
- <p><br> %option prefix="XYZ"
- <br>
- <p><br></pre>
- is equivalent to
- <b>-PXYZ.</b>
- Finally,
- <pre>
- <p><br> %option yyclass="foo"
- <br>
- <p><br></pre>
- only applies when generating a C++ scanner (
- <b>-+</b>
- option). It informs
- <i>flex</i>
- that you have derived
- <b>foo</b>
- as a subclass of
- <b>yyFlexLexer,</b>
- so
- <i>flex</i>
- will place your actions in the member function
- <b>foo::yylex()</b>
- instead of
- <b>yyFlexLexer::yylex().</b>
- It also generates a
- <b>yyFlexLexer::yylex()</b>
- member function that emits a run-time error (by invoking
- <b>yyFlexLexer::LexerError())</b>
- if called.
- See Generating C++ Scanners, below, for additional information.
- <p>
- A number of options are available for lint purists who want to suppress
- the appearance of unneeded routines in the generated scanner. Each of the
- following, if unset, results in the corresponding routine not appearing in
- the generated scanner:
- <pre>
- <p><br> input, unput
- <br> yy_push_state, yy_pop_state, yy_top_state
- <br> yy_scan_buffer, yy_scan_bytes, yy_scan_string
- <br>
- <p><br></pre>
- (though
- <b>yy_push_state()</b>
- and friends won't appear anyway unless you use
- <b>%option</b> <b>stack).</b>
- </ul><H2>PERFORMANCE CONSIDERATIONS </H2><ul>
- The main design goal of
- <i>flex</i>
- is that it generate high-performance scanners. It has been optimized
- for dealing well with large sets of rules. Aside from the effects on
- scanner speed of the table compression
- <b>-C</b>
- options outlined above,
- there are a number of options/actions which degrade performance. These
- are, from most expensive to least:
- <pre>
- <p><br> REJECT
- <br> %option yylineno
- <br> arbitrary trailing context
- <br>
- <p><br> pattern sets that require backing up
- <br> %array
- <br> %option interactive
- <br> %option always-interactive
- <br>
- <p><br> '^' beginning-of-line operator
- <br> yymore()
- <br>
- <p><br></pre>
- with the first three all being quite expensive and the last two
- being quite cheap. Note also that
- <b>unput()</b>
- is implemented as a routine call that potentially does quite a bit of
- work, while
- <b>yyless()</b>
- is a quite-cheap macro; so if just putting back some excess text you
- scanned, use
- <b>yyless().</b>
- <p>
- <b>REJECT</b>
- should be avoided at all costs when performance is important.
- It is a particularly expensive option.
- <p>
- Getting rid of backing up is messy and often may be an enormous
- amount of work for a complicated scanner. In principle, one begins
- by using the
- <b>-b</b>
- flag to generate a
- <i>lex.backup</i>
- file. For example, on the input
- <pre>
- <p><br> %%
- <br> foo return TOK_KEYWORD;
- <br> foobar return TOK_KEYWORD;
- <br>
- <p><br></pre>
- the file looks like:
- <pre>
- <p><br> State #6 is non-accepting -
- <br> associated rule line numbers:
- <br> 2 3
- <br> out-transitions: [ o ]
- <br> jam-transitions: EOF [ \001-n p-\177 ]
- <br>
- <p><br> State #8 is non-accepting -
- <br> associated rule line numbers:
- <br> 3
- <br> out-transitions: [ a ]
- <br> jam-transitions: EOF [ \001-` b-\177 ]
- <br>
- <p><br> State #9 is non-accepting -
- <br> associated rule line numbers:
- <br> 3
- <br> out-transitions: [ r ]
- <br> jam-transitions: EOF [ \001-q s-\177 ]
- <br>
- <p><br> Compressed tables always back up.
- <br>
- <p><br></pre>
- The first few lines tell us that there's a scanner state in
- which it can make a transition on an 'o' but not on any other
- character, and that in that state the currently scanned text does not match
- any rule. The state occurs when trying to match the rules found
- at lines 2 and 3 in the input file.
- If the scanner is in that state and then reads
- something other than an 'o', it will have to back up to find
- a rule which is matched. With
- a bit of headscratching one can see that this must be the
- state it's in when it has seen "fo". When this has happened,
- if anything other than another 'o' is seen, the scanner will
- have to back up to simply match the 'f' (by the default rule).
- <p>
- The comment regarding State #8 indicates there's a problem
- when "foob" has been scanned. Indeed, on any character other
- than an 'a', the scanner will have to back up to accept "foo".
- Similarly, the comment for State #9 concerns when "fooba" has
- been scanned and an 'r' does not follow.
- <p>
- The final comment reminds us that there's no point going to
- all the trouble of removing backing up from the rules unless
- we're using
- <b>-Cf</b>
- or
- <b>-CF,</b>
- since there's no performance gain doing so with compressed scanners.
- <p>
- The way to remove the backing up is to add "error" rules:
- <pre>
- <p><br> %%
- <br> foo return TOK_KEYWORD;
- <br> foobar return TOK_KEYWORD;
- <br>
- <p><br> fooba |
- <br> foob |
- <br> fo {
- <br> /* false alarm, not really a keyword */
- <br> return TOK_ID;
- <br> }
- <br>
- <p><br></pre>
- <p>
- Eliminating backing up among a list of keywords can also be
- done using a "catch-all" rule:
- <pre>
- <p><br> %%
- <br> foo return TOK_KEYWORD;
- <br> foobar return TOK_KEYWORD;
- <br>
- <p><br> [a-z]+ return TOK_ID;
- <br>
- <p><br></pre>
- This is usually the best solution when appropriate.
- <p>
- Backing up messages tend to cascade.
- With a complicated set of rules it's not uncommon to get hundreds
- of messages. If one can decipher them, though, it often
- only takes a dozen or so rules to eliminate the backing up (though
- it's easy to make a mistake and have an error rule accidentally match
- a valid token. A possible future
- <i>flex</i>
- feature will be to automatically add rules to eliminate backing up).
- <p>
- It's important to keep in mind that you gain the benefits of eliminating
- backing up only if you eliminate
- <i>every</i>
- instance of backing up. Leaving just one means you gain nothing.
- <p>
- <i>Variable</i>
- trailing context (where both the leading and trailing parts do not have
- a fixed length) entails almost the same performance loss as
- <b>REJECT</b>
- (i.e., substantial). So when possible a rule like:
- <pre>
- <p><br> %%
- <br> mouse|rat/(cat|dog) run();
- <br>
- <p><br></pre>
- is better written:
- <pre>
- <p><br> %%
- <br> mouse/cat|dog run();
- <br> rat/cat|dog run();
- <br>
- <p><br></pre>
- or as
- <pre>
- <p><br> %%
- <br> mouse|rat/cat run();
- <br> mouse|rat/dog run();
- <br>
- <p><br></pre>
- Note that here the special '|' action does
- <i>not</i>
- provide any savings, and can even make things worse (see
- Deficiencies / Bugs below).
- <p>
- Another area where the user can increase a scanner's performance
- (and one that's easier to implement) arises from the fact that
- the longer the tokens matched, the faster the scanner will run.
- This is because with long tokens the processing of most input
- characters takes place in the (short) inner scanning loop, and
- does not often have to go through the additional work of setting up
- the scanning environment (e.g.,
- <b>yytext)</b>
- for the action. Recall the scanner for C comments:
- <pre>
- <p><br> %x comment
- <br> %%
- <br> int line_num = 1;
- <br>
- <p><br> "/*" BEGIN(comment);
- <br>
- <p><br> &lt;comment&gt;[^*\n]*
- <br> &lt;comment&gt;"*"+[^*/\n]*
- <br> &lt;comment&gt;\n ++line_num;
- <br> &lt;comment&gt;"*"+"/" BEGIN(INITIAL);
- <br>
- <p><br></pre>
- This could be sped up by writing it as:
- <pre>
- <p><br> %x comment
- <br> %%
- <br> int line_num = 1;
- <br>
- <p><br> "/*" BEGIN(comment);
- <br>
- <p><br> &lt;comment&gt;[^*\n]*
- <br> &lt;comment&gt;[^*\n]*\n ++line_num;
- <br> &lt;comment&gt;"*"+[^*/\n]*
- <br> &lt;comment&gt;"*"+[^*/\n]*\n ++line_num;
- <br> &lt;comment&gt;"*"+"/" BEGIN(INITIAL);
- <br>
- <p><br></pre>
- Now instead of each newline requiring the processing of another
- action, recognizing the newlines is "distributed" over the other rules
- to keep the matched text as long as possible. Note that
- <i>adding</i>
- rules does
- <i>not</i>
- slow down the scanner! The speed of the scanner is independent
- of the number of rules or (modulo the considerations given at the
- beginning of this section) how complicated the rules are with
- regard to operators such as '*' and '|'.
- <p>
- A final example in speeding up a scanner: suppose you want to scan
- through a file containing identifiers and keywords, one per line
- and with no other extraneous characters, and recognize all the
- keywords. A natural first approach is:
- <pre>
- <p><br> %%
- <br> asm |
- <br> auto |
- <br> break |
- <br> ... etc ...
- <br> volatile |
- <br> while /* it's a keyword */
- <br>
- <p><br> .|\n /* it's not a keyword */
- <br>
- <p><br></pre>
- To eliminate the back-tracking, introduce a catch-all rule:
- <pre>
- <p><br> %%
- <br> asm |
- <br> auto |
- <br> break |
- <br> ... etc ...
- <br> volatile |
- <br> while /* it's a keyword */
- <br>
- <p><br> [a-z]+ |
- <br> .|\n /* it's not a keyword */
- <br>
- <p><br></pre>
- Now, if it's guaranteed that there's exactly one word per line,
- then we can reduce the total number of matches by a half by
- merging in the recognition of newlines with that of the other
- tokens:
- <pre>
- <p><br> %%
- <br> asm\n |
- <br> auto\n |
- <br> break\n |
- <br> ... etc ...
- <br> volatile\n |
- <br> while\n /* it's a keyword */
- <br>
- <p><br> [a-z]+\n |
- <br> .|\n /* it's not a keyword */
- <br>
- <p><br></pre>
- One has to be careful here, as we have now reintroduced backing up
- into the scanner. In particular, while
- <i>we</i>
- know that there will never be any characters in the input stream
- other than letters or newlines,
- <i>flex</i>
- can't figure this out, and it will plan for possibly needing to back up
- when it has scanned a token like "auto" and then the next character
- is something other than a newline or a letter. Previously it would
- then just match the "auto" rule and be done, but now it has no "auto"
- rule, only a "auto\n" rule. To eliminate the possibility of backing up,
- we could either duplicate all rules but without final newlines, or,
- since we never expect to encounter such an input and therefore don't
- care how it's classified, we can introduce one more catch-all rule, this
- one which doesn't include a newline:
- <pre>
- <p><br> %%
- <br> asm\n |
- <br> auto\n |
- <br> break\n |
- <br> ... etc ...
- <br> volatile\n |
- <br> while\n /* it's a keyword */
- <br>
- <p><br> [a-z]+\n |
- <br> [a-z]+ |
- <br> .|\n /* it's not a keyword */
- <br>
- <p><br></pre>
- Compiled with
- <b>-Cf,</b>
- this is about as fast as one can get a
- <i>flex</i>
- scanner to go for this particular problem.
- <p>
- A final note:
- <i>flex</i>
- is slow when matching NUL's, particularly when a token contains
- multiple NUL's.
- It's best to write rules which match
- <i>short</i>
- amounts of text if it's anticipated that the text will often include NUL's.
- <p>
- Another final note regarding performance: as mentioned above in the section
- How the Input is Matched, dynamically resizing
- <b>yytext</b>
- to accommodate huge tokens is a slow process because it presently requires that
- the (huge) token be rescanned from the beginning. Thus if performance is
- vital, you should attempt to match "large" quantities of text but not
- "huge" quantities, where the cutoff between the two is at about 8K
- characters/token.
- </ul><H2>GENERATING C++ SCANNERS </H2><ul>
- <i>flex</i>
- provides two different ways to generate scanners for use with C++. The
- first way is to simply compile a scanner generated by
- <i>flex</i>
- using a C++ compiler instead of a C compiler. You should not encounter
- any compilation errors (please report any you find to the email address
- given in the Author section below). You can then use C++ code in your
- rule actions instead of C code. Note that the default input source for
- your scanner remains
- <i>yyin,</i>
- and default echoing is still done to
- <i>yyout.</i>
- Both of these remain
- <i>FILE</i> <i>*</i>
- variables and not C++
- <i>streams.</i>
- <p>
- You can also use
- <i>flex</i>
- to generate a C++ scanner class, using the
- <b>-+</b>
- option (or, equivalently,
- <b>%option</b> <b>c++),</b>
- which is automatically specified if the name of the flex
- executable ends in a '+', such as
- <i>flex++.</i>
- When using this option, flex defaults to generating the scanner to the file
- <b>lex.yy.cc</b>
- instead of
- <b>lex.yy.c.</b>
- The generated scanner includes the header file
- <i>FlexLexer.h,</i>
- which defines the interface to two C++ classes.
- <p>
- The first class,
- <b>FlexLexer,</b>
- provides an abstract base class defining the general scanner class
- interface. It provides the following member functions:
- <p><dl compact><dt><b>const</b> <b>char*</b> <b>YYText()</b>
- <dd>returns the text of the most recently matched token, the equivalent of
- <b>yytext.</b>
- <dt><b>int</b> <b>YYLeng()</b>
- <dd>returns the length of the most recently matched token, the equivalent of
- <b>yyleng.</b>
- <dt><b>int</b> <b>lineno()</b> <b>const</b>
- <dd>returns the current input line number
- (see
- <b>%option</b> <b>yylineno),</b>
- or
- <b>1</b>
- if
- <b>%option</b> <b>yylineno</b>
- was not used.
- <dt><b>void</b> <b>set_debug(</b> <b>int</b> <b>flag</b> <b>)</b>
- <dd>sets the debugging flag for the scanner, equivalent to assigning to
- <b>yy_flex_debug</b>
- (see the Options section above). Note that you must build the scanner
- using
- <b>%option</b> <b>debug</b>
- to include debugging information in it.
- <dt><b>int</b> <b>debug()</b> <b>const</b>
- <dd>returns the current setting of the debugging flag.
- </dl>
- <p>
- Also provided are member functions equivalent to
- <b>yy_switch_to_buffer(),</b>
- <b>yy_create_buffer()</b>
- (though the first argument is an
- <b>istream*</b>
- object pointer and not a
- <b>FILE*),</b>
- <b>yy_flush_buffer(),</b>
- <b>yy_delete_buffer(),</b>
- and
- <b>yyrestart()</b>
- (again, the first argument is an
- <b>istream*</b>
- object pointer).
- <p>
- The second class defined in
- <i>FlexLexer.h</i>
- is
- <b>yyFlexLexer,</b>
- which is derived from
- <b>FlexLexer.</b>
- It defines the following additional member functions:
- <p><dl compact><dt><b>yyFlexLexer( istream* arg_yyin = 0, ostream* arg_yyout = 0 )
- </b><dd>constructs a
- <b>yyFlexLexer</b>
- object using the given streams for input and output. If not specified,
- the streams default to
- <b>cin</b>
- and
- <b>cout,</b>
- respectively.
- <dt><b>virtual</b> <b>int</b> <b>yylex()</b>
- <dd>performs the same role as
- <b>yylex()</b>
- does for ordinary flex scanners: it scans the input stream, consuming
- tokens, until a rule's action returns a value. If you derive a subclass
- <b>S</b>
- from
- <b>yyFlexLexer</b>
- and want to access the member functions and variables of
- <b>S</b>
- inside
- <b>yylex(),</b>
- then you need to use
- <b>%option</b> <b>yyclass=S</b>
- to inform
- <i>flex</i>
- that you will be using that subclass instead of
- <b>yyFlexLexer.</b>
- In this case, rather than generating
- <b>yyFlexLexer::yylex(),</b>
- <i>flex</i>
- generates
- <b>S::yylex()</b>
- (and also generates a dummy
- <b>yyFlexLexer::yylex()</b>
- that calls
- <b>yyFlexLexer::LexerError()</b>
- if called).
- <dt><b>virtual void switch_streams(istream* new_in = 0,
- ostream* new_out = 0)
- </b><dd>reassigns
- <b>yyin</b>
- to
- <b>new_in</b>
- (if non-nil)
- and
- <b>yyout</b>
- to
- <b>new_out</b>
- (ditto), deleting the previous input buffer if
- <b>yyin</b>
- is reassigned.
- <dt><b>int yylex( istream* new_in = 0, ostream* new_out = 0 )
- </b><dd>first switches the input streams via
- <b>switch_streams(</b> <b>new_in,</b> <b>new_out</b> <b>)</b>
- and then returns the value of
- <b>yylex().</b>
- </dl>
- <p>
- In addition,
- <b>yyFlexLexer</b>
- defines the following protected virtual functions which you can redefine
- in derived classes to tailor the scanner:
- <p><dl compact><dt><b>virtual int LexerInput( char* buf, int max_size )
- </b><dd>reads up to
- <b>max_size</b>
- characters into
- <b>buf</b>
- and returns the number of characters read. To indicate end-of-input,
- return 0 characters. Note that "interactive" scanners (see the
- <b>-B</b>
- and
- <b>-I</b>
- flags) define the macro
- <b>YY_INTERACTIVE.</b>
- If you redefine
- <b>LexerInput()</b>
- and need to take different actions depending on whether or not
- the scanner might be scanning an interactive input source, you can
- test for the presence of this name via
- <b>#ifdef.</b>
- <dt><b>virtual void LexerOutput( const char* buf, int size )
- </b><dd>writes out
- <b>size</b>
- characters from the buffer
- <b>buf,</b>
- which, while NUL-terminated, may also contain "internal" NUL's if
- the scanner's rules can match text with NUL's in them.
- <dt><b>virtual void LexerError( const char* msg )
- </b><dd>reports a fatal error message. The default version of this function
- writes the message to the stream
- <b>cerr</b>
- and exits.
- </dl>
- <p>
- Note that a
- <b>yyFlexLexer</b>
- object contains its
- <i>entire</i>
- scanning state. Thus you can use such objects to create reentrant
- scanners. You can instantiate multiple instances of the same
- <b>yyFlexLexer</b>
- class, and you can also combine multiple C++ scanner classes together
- in the same program using the
- <b>-P</b>
- option discussed above.
- <p>
- Finally, note that the
- <b>%array</b>
- feature is not available to C++ scanner classes; you must use
- <b>%pointer</b>
- (the default).
- <p>
- Here is an example of a simple C++ scanner:
- <pre>
- <p><br> // An example of using the flex C++ scanner class.
- <br>
- <p><br> %{
- <br> int mylineno = 0;
- <br> %}
- <br>
- <p><br> string \"[^\n"]+\"
- <br>
- <p><br> ws [ \t]+
- <br>
- <p><br> alpha [A-Za-z]
- <br> dig [0-9]
- <br> name ({alpha}|{dig}|\$)({alpha}|{dig}|[_.\-/$])*
- <br> num1 [-+]?{dig}+\.?([eE][-+]?{dig}+)?
- <br> num2 [-+]?{dig}*\.{dig}+([eE][-+]?{dig}+)?
- <br> number {num1}|{num2}
- <br>
- <p><br> %%
- <br>
- <p><br> {ws} /* skip blanks and tabs */
- <br>
- <p><br> "/*" {
- <br> int c;
- <br>
- <p><br> while((c = yyinput()) != 0)
- <br> {
- <br> if(c == '\n')
- <br> ++mylineno;
- <br>
- <p><br> else if(c == '*')
- <br> {
- <br> if((c = yyinput()) == '/')
- <br> break;
- <br> else
- <br> unput(c);
- <br> }
- <br> }
- <br> }
- <br>
- <p><br> {number} cout << "number " << YYText() << '\n';
- <br>
- <p><br> \n mylineno++;
- <br>
- <p><br> {name} cout << "name " << YYText() << '\n';
- <br>
- <p><br> {string} cout << "string " << YYText() << '\n';
- <br>
- <p><br> %%
- <br>
- <p><br> int main( int /* argc */, char** /* argv */ )
- <br> {
- <br> FlexLexer* lexer = new yyFlexLexer;
- <br> while(lexer->yylex() != 0)
- <br> ;
- <br> return 0;
- <br> }
- <br></pre>
- If you want to create multiple (different) lexer classes, you use the
- <b>-P</b>
- flag (or the
- <b>prefix=</b>
- option) to rename each
- <b>yyFlexLexer</b>
- to some other
- <b>xxFlexLexer.</b>
- You then can include
- <b>&lt;FlexLexer.h&gt;</b>
- in your other sources once per lexer class, first renaming
- <b>yyFlexLexer</b>
- as follows:
- <pre>
- <p><br> #undef yyFlexLexer
- <br> #define yyFlexLexer xxFlexLexer
- <br> #include &lt;FlexLexer.h&gt;
- <br>
- <p><br> #undef yyFlexLexer
- <br> #define yyFlexLexer zzFlexLexer
- <br> #include &lt;FlexLexer.h&gt;
- <br>
- <p><br></pre>
- if, for example, you used
- <b>%option</b> <b>prefix=xx</b>
- for one of your scanners and
- <b>%option</b> <b>prefix=zz</b>
- for the other.
- <p>
- IMPORTANT: the present form of the scanning class is
- <i>experimental</i>
- and may change considerably between major releases.
- </ul><H2>INCOMPATIBILITIES WITH LEX AND POSIX </H2><ul>
- <i>flex</i>
- is a rewrite of the AT&T Unix
- <i>lex</i>
- tool (the two implementations do not share any code, though),
- with some extensions and incompatibilities, both of which
- are of concern to those who wish to write scanners acceptable
- to either implementation. Flex is fully compliant with the POSIX
- <i>lex</i>
- specification, except that when using
- <b>%pointer</b>
- (the default), a call to
- <b>unput()</b>
- destroys the contents of
- <b>yytext,</b>
- which is counter to the POSIX specification.
- <p>
- In this section we discuss all of the known areas of incompatibility
- between flex, AT&T lex, and the POSIX specification.
- <p>
- <i>flex's</i>
- <b>-l</b>
- option turns on maximum compatibility with the original AT&T
- <i>lex</i>
- implementation, at the cost of a major loss in the generated scanner's
- performance. We note below which incompatibilities can be overcome
- using the
- <b>-l</b>
- option.
- <p>
- <i>flex</i>
- is fully compatible with
- <i>lex</i>
- with the following exceptions:
- <p><dl compact><dt>-<dd>The undocumented
- <i>lex</i>
- scanner internal variable
- <b>yylineno</b>
- is not supported unless
- <b>-l</b>
- or
- <b>%option</b> <b>yylineno</b>
- is used.
- <dt><dd><b>yylineno</b>
- should be maintained on a per-buffer basis, rather than a per-scanner
- (single global variable) basis.
- <dt><dd><b>yylineno</b>
- is not part of the POSIX specification.
- <dt>-<dd>The
- <b>input()</b>
- routine is not redefinable, though it may be called to read characters
- following whatever has been matched by a rule. If
- <b>input()</b>
- encounters an end-of-file the normal
- <b>yywrap()</b>
- processing is done. A ``real'' end-of-file is returned by
- <b>input()</b>
- as
- <i>EOF.</i>
- <dt><dd>Input is instead controlled by defining the
- <b>YY_INPUT</b>
- macro.
- <dt><dd>The
- <i>flex</i>
- restriction that
- <b>input()</b>
- cannot be redefined is in accordance with the POSIX specification,
- which simply does not specify any way of controlling the
- scanner's input other than by making an initial assignment to
- <i>yyin.</i>
- <dt>-<dd>The
- <b>unput()</b>
- routine is not redefinable. This restriction is in accordance with POSIX.
- <dt>-<dd><i>flex</i>
- scanners are not as reentrant as
- <i>lex</i>
- scanners. In particular, if you have an interactive scanner and
- an interrupt handler which long-jumps out of the scanner, and
- the scanner is subsequently called again, you may get the following
- message:
- <pre>
- <p><br> fatal flex scanner internal error--end of buffer missed
- <br>
- <p><br></pre>
- To reenter the scanner, first use
- <pre>
- <p><br> yyrestart( yyin );
- <br>
- <p><br></pre>
- Note that this call will throw away any buffered input; usually this
- isn't a problem with an interactive scanner.
- <dt><dd>Also note that flex C++ scanner classes
- <i>are</i>
- reentrant, so if using C++ is an option for you, you should use
- them instead. See "Generating C++ Scanners" above for details.
- <dt>-<dd><b>output()</b>
- is not supported.
- Output from the
- <b>ECHO</b>
- macro is done to the file-pointer
- <i>yyout</i>
- (default
- <i>stdout).</i>
- <dt><dd><b>output()</b>
- is not part of the POSIX specification.
- <dt>-<dd><i>lex</i>
- does not support exclusive start conditions (%x), though they
- are in the POSIX specification.
- <dt>-<dd>When definitions are expanded,
- <i>flex</i>
- encloses them in parentheses.
- With lex, the following:
- <pre>
- <p><br> NAME [A-Z][A-Z0-9]*
- <br> %%
- <br> foo{NAME}? printf( "Found it\n" );
- <br> %%
- <br>
- <p><br></pre>
- will not match the string "foo" because when the macro
- is expanded the rule is equivalent to "foo[A-Z][A-Z0-9]*?"
- and the precedence is such that the '?' is associated with
- "[A-Z0-9]*". With
- <i>flex,</i>
- the rule will be expanded to
- "foo([A-Z][A-Z0-9]*)?" and so the string "foo" will match.
- <dt><dd>Note that if the definition begins with
- <b>^</b>
- or ends with
- <b>$</b>
- then it is
- <i>not</i>
- expanded with parentheses, to allow these operators to appear in
- definitions without losing their special meanings. But the
- <b>&lt;s&gt;,</b> <b>/,</b>
- and
- <b>&lt;&lt;EOF&gt;&gt;</b>
- operators cannot be used in a
- <i>flex</i>
- definition.
- <dt><dd>Using
- <b>-l</b>
- results in the
- <i>lex</i>
- behavior of no parentheses around the definition.
- <dt><dd>The POSIX specification is that the definition be enclosed in parentheses.
- <dt>-<dd>Some implementations of
- <i>lex</i>
- allow a rule's action to begin on a separate line, if the rule's pattern
- has trailing whitespace:
- <pre>
- <p><br> %%
- <br> foo|bar&lt;space here&gt;
- <br> { foobar_action(); }
- <br>
- <p><br></pre>
- <i>flex</i>
- does not support this feature.
- <dt>-<dd>The
- <i>lex</i>
- <b>%r</b>
- (generate a Ratfor scanner) option is not supported. It is not part
- of the POSIX specification.
- <dt>-<dd>After a call to
- <b>unput(),</b>
- <i>yytext</i>
- is undefined until the next token is matched, unless the scanner
- was built using
- <b>%array.</b>
- This is not the case with
- <i>lex</i>
- or the POSIX specification. The
- <b>-l</b>
- option does away with this incompatibility.
- <dt>-<dd>The precedence of the
- <b>{}</b>
- (numeric range) operator is different.
- <i>lex</i>
- interprets "abc{1,3}" as "match one, two, or
- three occurrences of 'abc'", whereas
- <i>flex</i>
- interprets it as "match 'ab'
- followed by one, two, or three occurrences of 'c'". The latter is
- in agreement with the POSIX specification.
- <dt>-<dd>The precedence of the
- <b>^</b>
- operator is different.
- <i>lex</i>
- interprets "^foo|bar" as "match either 'foo' at the beginning of a line,
- or 'bar' anywhere", whereas
- <i>flex</i>
- interprets it as "match either 'foo' or 'bar' if they come at the beginning
- of a line". The latter is in agreement with the POSIX specification.
- <dt>-<dd>The special table-size declarations such as
- <b>%a</b>
- supported by
- <i>lex</i>
- are not required by
- <i>flex</i>
- scanners;
- <i>flex</i>
- ignores them.
- <dt>-<dd>The name
- FLEX_SCANNER
- is #define'd so scanners may be written for use with either
- <i>flex</i>
- or
- <i>lex.</i>
- Scanners also include
- <b>YY_FLEX_MAJOR_VERSION</b>
- and
- <b>YY_FLEX_MINOR_VERSION</b>
- indicating which version of
- <i>flex</i>
- generated the scanner
- (for example, for the 2.5 release, these defines would be 2 and 5
- respectively).
- </dl>
- <p>
- The following
- <i>flex</i>
- features are not included in
- <i>lex</i>
- or the POSIX specification:
- <pre>
- <p><br> C++ scanners
- <br> %option
- <br> start condition scopes
- <br> start condition stacks
- <br> interactive/non-interactive scanners
- <br> yy_scan_string() and friends
- <br> yyterminate()
- <br> yy_set_interactive()
- <br> yy_set_bol()
- <br> YY_AT_BOL()
- <br> &lt;&lt;EOF&gt;&gt;
- <br> &lt;*&gt;
- <br> YY_DECL
- <br> YY_START
- <br> YY_USER_ACTION
- <br> YY_USER_INIT
- <br> #line directives
- <br> %{}'s around actions
- <br> multiple actions on a line
- <br>
- <p><br></pre>
- plus almost all of the flex flags.
- The last feature in the list refers to the fact that with
- <i>flex</i>
- you can put multiple actions on the same line, separated with
- semi-colons, while with
- <i>lex,</i>
- the following
- <pre>
- <p><br> foo handle_foo(); ++num_foos_seen;
- <br>
- <p><br></pre>
- is (rather surprisingly) truncated to
- <pre>
- <p><br> foo handle_foo();
- <br>
- <p><br></pre>
- <i>flex</i>
- does not truncate the action. Actions that are not enclosed in
- braces are simply terminated at the end of the line.
- </ul><H2>DIAGNOSTICS </H2><ul>
- <p>
- <i>warning,</i> <i>rule</i> <i>cannot</i> <i>be</i> <i>matched</i>
- indicates that the given rule
- cannot be matched because it follows other rules that will
- always match the same text as it. For
- example, in the following "foo" cannot be matched because it comes after
- an identifier "catch-all" rule:
- <pre>
- <p><br> [a-z]+ got_identifier();
- <br> foo got_foo();
- <br>
- <p><br></pre>
- Using
- <b>REJECT</b>
- in a scanner suppresses this warning.
- <p>
- <i>warning,</i>
- <b>-s</b>
- <i>option given but default rule can be matched
- </i>means that it is possible (perhaps only in a particular start condition)
- that the default rule (match any single character) is the only one
- that will match a particular input. Since
- <b>-s</b>
- was given, presumably this is not intended.
- <p>
- <i>reject_used_but_not_detected</i> <i>undefined</i>
- or
- <i>yymore_used_but_not_detected</i> <i>undefined</i> <i>-</i>
- These errors can occur at compile time. They indicate that the
- scanner uses
- <b>REJECT</b>
- or
- <b>yymore()</b>
- but that
- <i>flex</i>
- failed to notice the fact, meaning that
- <i>flex</i>
- scanned the first two sections looking for occurrences of these actions
- and failed to find any, but somehow you snuck some in (via a #include
- file, for example). Use
- <b>%option</b> <b>reject</b>
- or
- <b>%option</b> <b>yymore</b>
- to indicate to flex that you really do use these features.
- <p>
- <i>flex</i> <i>scanner</i> <i>jammed</i> <i>-</i>
- a scanner compiled with
- <b>-s</b>
- has encountered an input string which wasn't matched by
- any of its rules. This error can also occur due to internal problems.
- <p>
- <i>token</i> <i>too</i> <i>large,</i> <i>exceeds</i> <i>YYLMAX</i> <i>-</i>
- your scanner uses
- <b>%array</b>
- and one of its rules matched a string longer than the
- <b>YYLMAX</b>
- constant (8K bytes by default). You can increase the value by
- #define'ing
- <b>YYLMAX</b>
- in the definitions section of your
- <i>flex</i>
- input.
- <p>
- <i>scanner</i> <i>requires</i> <i>-8</i> <i>flag</i> <i>to</i>
- <i>use</i> <i>the</i> <i>character</i> <i>'x'</i> <i>-</i>
- Your scanner specification includes recognizing the 8-bit character
- <i>'x'</i>
- and you did not specify the -8 flag, and your scanner defaulted to 7-bit
- because you used the
- <b>-Cf</b>
- or
- <b>-CF</b>
- table compression options. See the discussion of the
- <b>-7</b>
- flag for details.
- <p>
- <i>flex</i> <i>scanner</i> <i>push-back</i> <i>overflow</i> <i>-</i>
- you used
- <b>unput()</b>
- to push back so much text that the scanner's buffer could not hold
- both the pushed-back text and the current token in
- <b>yytext.</b>
- Ideally the scanner should dynamically resize the buffer in this case, but at
- present it does not.
- <p>
- <i>input buffer overflow, can't enlarge buffer because scanner uses REJECT -
- </i>the scanner was working on matching an extremely large token and needed
- to expand the input buffer. This doesn't work with scanners that use
- <b>REJECT.
- </b>
- <p>
- <i>fatal flex scanner internal error--end of buffer missed -
- </i>This can occur in a scanner which is reentered after a long-jump
- has jumped out (or over) the scanner's activation frame. Before
- reentering the scanner, use:
- <pre>
- <p><br> yyrestart( yyin );
- <br>
- <p><br></pre>
- or, as noted above, switch to using the C++ scanner class.
- <p>
- <i>too</i> <i>many</i> <i>start</i> <i>conditions</i> <i>in</i> <i>&lt;&gt;</i> <i>construct!</i> <i>-</i>
- you listed more start conditions in a &lt;&gt; construct than exist (so
- you must have listed at least one of them twice).
- </ul><H2>FILES </H2><ul>
- <p><dl compact><dt><b>-lfl</b>
- <dd>library with which scanners must be linked.
- <dt><i>lex.yy.c</i>
- <dd>generated scanner (called
- <i>lexyy.c</i>
- on some systems).
- <dt><i>lex.yy.cc</i>
- <dd>generated C++ scanner class, when using
- <b>-+.</b>
- <dt><i>&lt;FlexLexer.h&gt;</i>
- <dd>header file defining the C++ scanner base class,
- <b>FlexLexer,</b>
- and its derived class,
- <b>yyFlexLexer.</b>
- <dt><i>flex.skl</i>
- <dd>skeleton scanner. This file is only used when building flex, not when
- flex executes.
- <dt><i>lex.backup</i>
- <dd>backing-up information for
- <b>-b</b>
- flag (called
- <i>lex.bck</i>
- on some systems).
- </dl>
- </ul><H2>DEFICIENCIES / BUGS </H2><ul>
- <p>
- Some trailing context
- patterns cannot be properly matched and generate
- warning messages ("dangerous trailing context"). These are
- patterns where the ending of the
- first part of the rule matches the beginning of the second
- part, such as "zx*/xy*", where the 'x*' matches the 'x' at
- the beginning of the trailing context. (Note that the POSIX draft
- states that the text matched by such patterns is undefined.)
- <p>
- For some trailing context rules, parts which are actually fixed-length are
- not recognized as such, leading to the abovementioned performance loss.
- In particular, parts using '|' or {n} (such as "foo{3}") are always
- considered variable-length.
- <p>
- Combining trailing context with the special '|' action can result in
- <i>fixed</i>
- trailing context being turned into the more expensive
- <i>variable</i>
- trailing context. For example, this happens in the following:
- <pre>
- <p><br> %%
- <br> abc |
- <br> xyz/def
- <br>
- <p><br></pre>
- <p>
- Use of
- <b>unput()</b>
- invalidates yytext and yyleng, unless the
- <b>%array</b>
- directive
- or the
- <b>-l</b>
- option has been used.
- <p>
- Pattern-matching of NUL's is substantially slower than matching other
- characters.
- <p>
- Dynamic resizing of the input buffer is slow, as it entails rescanning
- all the text matched so far by the current (generally huge) token.
- <p>
- Due to both buffering of input and read-ahead, you cannot intermix
- calls to &lt;stdio.h&gt; routines, such as, for example,
- <b>getchar(),</b>
- with
- <i>flex</i>
- rules and expect it to work. Call
- <b>input()</b>
- instead.
- <p>
- The total table entries listed by the
- <b>-v</b>
- flag exclude the number of table entries needed to determine
- what rule has been matched. The number of entries is equal
- to the number of DFA states if the scanner does not use
- <b>REJECT,</b>
- and somewhat greater than the number of states if it does.
- <p>
- <b>REJECT</b>
- cannot be used with the
- <b>-f</b>
- or
- <b>-F</b>
- options.
- <p>
- The
- <i>flex</i>
- internal algorithms need documentation.
- </ul><H2>SEE ALSO </H2><ul>
- <p>
- lex(1), yacc(1), sed(1), awk(1).
- <p>
- John Levine, Tony Mason, and Doug Brown,
- <i>Lex</i> <i>&amp;</i> <i>Yacc,</i>
- O'Reilly and Associates. Be sure to get the 2nd edition.
- <p>
- M. E. Lesk and E. Schmidt,
- <i>LEX</i> <i>-</i> <i>Lexical</i> <i>Analyzer</i> <i>Generator.</i>
- <p>
- Alfred Aho, Ravi Sethi and Jeffrey Ullman,
- <i>Compilers:</i> <i>Principles,</i> <i>Techniques</i> <i>and</i> <i>Tools,</i>
- Addison-Wesley (1986). Describes the pattern-matching techniques used by
- <i>flex</i>
- (deterministic finite automata).
- </ul><H2>AUTHOR </H2><ul>
- Vern Paxson, with the help of many ideas and much inspiration from
- Van Jacobson. Original version by Jef Poskanzer. The fast table
- representation is a partial implementation of a design done by Van
- Jacobson. The implementation was done by Kevin Gong and Vern Paxson.
- <p>
- Thanks to the many
- <i>flex</i>
- beta-testers, feedbackers, and contributors, especially Francois Pinard,
- Casey Leedom,
- Stan Adermann, Terry Allen, David Barker-Plummer, John Basrai,
- Nelson H.F. Beebe, benson@odi.com,
- Karl Berry, Peter A. Bigot, Simon Blanchard,
- Keith Bostic, Frederic Brehm, Ian Brockbank, Kin Cho, Nick Christopher,
- Brian Clapper, J.T. Conklin,
- Jason Coughlin, Bill Cox, Nick Cropper, Dave Curtis, Scott David
- Daniels, Chris G. Demetriou, Theo Deraadt,
- Mike Donahue, Chuck Doucette, Tom Epperly, Leo Eskin,
- Chris Faylor, Chris Flatters, Jon Forrest, Joe Gayda, Kaveh R. Ghazi,
- Eric Goldman, Christopher M. Gould, Ulrich Grepel, Peer Griebel,
- Jan Hajic, Charles Hemphill, NORO Hideo,
- Jarkko Hietaniemi, Scott Hofmann,
- Jeff Honig, Dana Hudes, Eric Hughes, John Interrante,
- Ceriel Jacobs, Michal Jaegermann, Sakari Jalovaara, Jeffrey R. Jones,
- Henry Juengst, Klaus Kaempf, Jonathan I. Kamens, Terrence O Kane,
- Amir Katz, ken@ken.hilco.com, Kevin B. Kenny,
- Steve Kirsch, Winfried Koenig, Marq Kole, Ronald Lamprecht,
- Greg Lee, Rohan Lenard, Craig Leres, John Levine, Steve Liddle, Mike Long,
- Mohamed el Lozy, Brian Madsen, Malte, Joe Marshall,
- Bengt Martensson, Chris Metcalf,
- Luke Mewburn, Jim Meyering, R. Alexander Milowski, Erik Naggum,
- G.T. Nicol, Landon Noll, James Nordby, Marc Nozell,
- Richard Ohnemus, Karsten Pahnke,
- Sven Panne, Roland Pesch, Walter Pelissero, Gaumond
- Pierre, Esmond Pitt, Jef Poskanzer, Joe Rahmeh, Jarmo Raiha,
- Frederic Raimbault, Pat Rankin, Rick Richardson,
- Kevin Rodgers, Kai Uwe Rommel, Jim Roskind, Alberto Santini,
- Andreas Scherer, Darrell Schiebel, Raf Schietekat,
- Doug Schmidt, Philippe Schnoebelen, Andreas Schwab,
- Alex Siegel, Eckehard Stolz, Jan-Erik Str&ouml;mquist,
- Mike Stump, Paul Stuart, Dave Tallman, Ian Lance Taylor,
- Chris Thewalt, Richard M. Timoney, Jodi Tsai,
- Paul Tuinenga, Gary Weik, Frank Whaley, Gerhard Wilhelms, Kent Williams, Ken
- Yap, Ron Zellar, Nathan Zelle, David Zuhn,
- and those whose names have slipped my marginal
- mail-archiving skills but whose contributions are appreciated all the
- same.
- <p>
- Thanks to Keith Bostic, Jon Forrest, Noah Friedman,
- John Gilmore, Craig Leres, John Levine, Bob Mulcahy, G.T.
- Nicol, Francois Pinard, Rich Salz, and Richard Stallman for help with various
- distribution headaches.
- <p>
- Thanks to Esmond Pitt and Earle Horton for 8-bit character support; to
- Benson Margulies and Fred Burke for C++ support; to Kent Williams and Tom
- Epperly for C++ class support; to Ove Ewerlid for support of NUL's; and to
- Eric Hughes for support of multiple buffers.
- <p>
- This work was primarily done when I was with the Real Time Systems Group
- at the Lawrence Berkeley Laboratory in Berkeley, CA. Many thanks to all there
- for the support I received.
- <p>
- Send comments to vern@ee.lbl.gov.
- </ul></body></html>
|