Browse Source

Lua : Regex support!

This one gets an exclamation mark, since I was happy to get it
working... Buuuuut... It carries some caveats.

The Haxe code here supports full-blown PCRE-based regex
operations in Lua.  *HOWEVER*, it provides this functionality via a third-party
library called lrexlib: http://rrthomas.github.io/lrexlib/

Lrexlib is compiled, meaning that Haxe lua users will need to compile
and install the extension themselves on their target in order to use
this base Haxe class.  However, I think this is the best approach given:

1.  Lua doesn't come with regex "out of the box".  Instead, it has a
watered down version that will not be easy to adjust to between targets.
2.  It is easy to compile/install third party libs like lrexlib via the
"luarocks" package manager.
3.  The compiled "so" output is very small, and gives the regex library
the same speed as most other implementations (e.g. js).

It's entirely possible to swap out the "Rex" extern here for another
regex library (Lpeg, Oniguruma, or one of the even newer ones).
However, I'm guessing most folks are going to be comfortable with PCRE.
If not, I'm hoping they can give a pull request for their library of
choice :)
Justin Donaldson 10 years ago
parent
commit
cb4b22869d
3 changed files with 107 additions and 43 deletions
  1. 54 0
      std/lua/Rex.hx
  2. 0 0
      std/lua/RexTools.hx
  3. 53 43
      std/lua/_std/EReg.hx

+ 54 - 0
std/lua/Rex.hx

@@ -0,0 +1,54 @@
+package lua;
+@:luaRequire("rex_pcre")
+extern class Rex { // extern declarations for the "rex_pcre" Lua module named in @:luaRequire
+	public function new(expr : String, flag : String); // expr: pattern source; flag: option characters
+
+	/**
+	  Searches for the first match of this compiled regexp, starting from offset
+	  init, subject to flags ef. NOTE(review): `patt` presumably is the subject string.
+	  Returns matched string, or array of strings.
+	 **/
+	public function match(patt : String, ?init : Int, ?ef : Int) : Dynamic;
+
+	/**
+	 Searches the string subj for the first match of this compiled regexp,
+	 starting from offset init, subject to flags ef.
+	 Returns Dynamic; presumably the match offsets plus captures, as in lrexlib (TODO confirm).
+	 **/
+	public function find(subj : String, ?init : Int, ?ef : Int) : Dynamic;
+
+
+	/**
+	 Intended for use in the generic for Lua construct. It is used for
+	 splitting a subject string subj into parts (sections). The sep
+	 parameter is a regular expression pattern representing the separators
+	 between the sections; cf and ef are flags.
+	 **/
+	public function split(subj : String, sep : String, cf : Int, ef : Int) : lua.Table<Int,String>;
+
+
+	/**
+	  Counts matches of the pattern patt in the string subj, subject to flags cf and ef.
+	**/	
+	public function count(subj : String, patt : String, cf : Int, ef : Int) : Dynamic;
+
+	public function flags(tb:Dynamic) : Dynamic; // NOTE(review): undocumented here; see the lrexlib manual
+
+	public function tfind(subj : String, ?init : Int, ?ef : Int) : Dynamic; // NOTE(review): undocumented here; see the lrexlib manual
+
+	public function exec(subj : String, ?init : Int, ?ef : Int) : Dynamic; // used by EReg.match in this commit
+
+	/**
+	 Intended for use in the generic for Lua construct. Returns an iterator for
+	 repeated matching of a pattern in the string subj, subject to flags cf and ef.
+	 NOTE(review): the original doc mentions patt, but this signature declares no such parameter.
+	 **/
+	public static function gmatch(subj : String, ?cf : Int, ?ef : Int) : lua.Iterator;
+
+	/**
+	  Searches for all matches of the pattern patt in the string subj and
+	  replaces them according to the parameters repl and n; cf and ef are flags.
+	 **/
+	public static function gsub(subj : String, patt : String, repl: Dynamic, ?n: Int, ?cf : Int, ?ef : Int) : String;
+}
+

+ 0 - 0
std/lua/RexTools.hx


+ 53 - 43
std/lua/_std/EReg.hx

@@ -19,60 +19,69 @@
  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
  * DEALINGS IN THE SOFTWARE.
  */
-@:coreApi class EReg {
+import lua.Rex;
+import lua.Table;
+import lua.Boot;
+import lua.TableTools;
+// @:coreApi
+class EReg {
 
-	var r : HaxeRegExp;
+	var r : Rex; // the Rex extern instance.
+	var global : Bool;  // whether the regex is in global mode.
+	var s : String; // the last matched string
+	var m : Table<Int,Dynamic>; // the [start:Int, end:Int, and submatches:String (matched groups)] as a single table.
 
 	public function new( r : String, opt : String ) : Void {
-		opt = opt.split("u").join(""); // 'u' (utf8) depends on page encoding
-		this.r = new HaxeRegExp(r, opt);
+		var ropt = new StringBuf(); // collects the option characters that are forwarded to Rex
+		for (i in 0...opt.length){
+			switch(opt.charAt(i)){
+				case "i", "m", "s" : ropt.add(opt.charAt(i)); // passed through to Rex unchanged
+				case "g" : global = true; // tracked on this class; not forwarded to Rex
+				default : null; // any other option character is ignored
+			}
+		}
+		if (global == null) global = false; // no "g" seen: default to non-global
+		this.r = new Rex(r, ropt.toString());
 	}
 
-	public function match( s : String ) : Bool {
-		if( r.global ) r.lastIndex = 0;
-		r.m = r.exec(s);
-		r.s = s;
-		return (r.m != null);
+	public function match( str : String ) : Bool {
+		m = untyped Boot.unpack(r.exec(str)); // presumably gathers exec's results into one table - confirm against lua.Boot
+		s = str; // remembered so the matched*() accessors can slice it
+		return m[0] != null; // NOTE(review): reads index 0 here, while matched() reads self.m[1] in raw Lua - confirm the index mapping
 	}
 
 	public function matched( n : Int ) : String {
-		return if( r.m != null && n >= 0 && n < r.m.length ) r.m[n] else throw "EReg::matched";
+		if (m == null || n < 0) throw "EReg::matched"; // NOTE(review): the removed version also rejected n >= m.length; no upper bound is checked here
+		else if (n == 0) {
+			// TODO: Figure out how to use lua 1-based indexing where appropriate,
+			// 	while also providing the lua.Table utility abstract.
+			return untyped __lua__("string.sub(self.s, self.m[1], self.m[2])"); // whole match: slice s between the stored start and end
+		} else {
+			var mn = 2 * (n - 1); // offset of group n's (start, end) pair inside self.m[3]
+			return untyped __lua__("string.sub(self.s, self.m[3][mn + 1], self.m[3][mn + 2])"); // NOTE(review): refers to the Haxe local mn by name inside raw Lua
+		}
 	}
 
 	public function matchedLeft() : String {
-		if( r.m == null ) throw "No string matched";
-		return r.s.substr(0,r.m.index);
+		if( m == null ) throw "No string matched";
+		return untyped __lua__("string.sub(self.s, 1, self.m[1]-1)"); // everything in s before the stored match start
 	}
 
 	public function matchedRight() : String {
-		if( r.m == null ) throw "No string matched";
-		var sz = r.m.index+r.m[0].length;
-		return r.s.substr(sz,r.s.length-sz);
+		if( m == null ) throw "No string matched";
+		return untyped __lua__("string.sub(self.s, self.m[2]+1)"); // everything in s after the stored match end
 	}
 
 	public function matchedPos() : { pos : Int, len : Int } {
-		if( r.m == null ) throw "No string matched";
-		return { pos : r.m.index, len : r.m[0].length };
+		if( m == null ) throw "No string matched";
+		return {
+			pos : m[0]-1, // presumably converts a 1-based Lua start offset to a 0-based position - TODO confirm
+			len : m[1]- m[0]+ 1 // NOTE(review): uses indices 0 and 1, while matched() uses self.m[1] and self.m[2] in raw Lua - confirm the index mapping
+		}
 	}
 
-	public function matchSub( s : String, pos : Int, len : Int = -1):Bool {
-		return if (r.global) {
-			r.lastIndex = pos;
-			r.m = r.exec(len < 0 ? s : s.substr(0, pos + len));
-			var b = r.m != null;
-			if (b) {
-				r.s = s;
-			}
-			b;
-		} else {
-			// TODO: check some ^/$ related corner cases
-			var b = match( len < 0 ? s.substr(pos) : s.substr(pos,len) );
-			if (b) {
-				r.s = s;
-				r.m.index += pos;
-			}
-			b;
-		}
+	public inline function matchSub( s : String, pos : Int, ?len : Int):Bool { // matches against the slice of s starting at pos (optionally limited to len)
+		return match(s.substr(pos, len)); // NOTE(review): match() stores the substring and offsets relative to it; the removed code kept the full s and re-added pos
 	}
 
 	public function split( s : String ) : Array<String> {
@@ -82,7 +91,12 @@
 	}
 
 	public function replace( s : String, by : String ) : String {
-		return untyped s.replace(r,by);
+		if (global){
+			return split(s).join(by); // global: the pieces returned by split() are rejoined with by inserted as-is
+		} else {
+			if (match(s)) return matchedLeft() + by + matchedRight(); // non-global: only the first match is replaced, by inserted as-is
+			else return s; // no match: s is returned unchanged
+		}
 	}
 
 	public function map( s : String, f : EReg -> String ) : String {
@@ -104,15 +118,11 @@
 			}
 			else
 				offset = p.pos + p.len;
-		} while (r.global);
-		if (!r.global && offset > 0 && offset < s.length)
+		} while (global);
+		if (!global && offset > 0 && offset < s.length)
 			buf.add(s.substr(offset));
 		return buf.toString();
 	}
-}
 
-@:native("RegExp")
-private extern class HaxeRegExp extends js.RegExp {
-	var m:js.RegExp.RegExpMatch;
-	var s:String;
 }
+