
--- Merging r20685 into '.':
U rtl/arm/arm.inc
--- Merging r21648 into '.':
U rtl/arm/strings.inc
--- Merging r21649 into '.':
G rtl/arm/arm.inc
U rtl/arm/divide.inc
--- Merging r21760 into '.':
G rtl/arm/arm.inc
--- Merging r21952 into '.':
U rtl/arm/mathu.inc
--- Merging r21961 into '.':
G rtl/arm/mathu.inc

# revisions: 20685,21648,21649,21760,21952,21961
r20685 | florian | 2012-04-01 19:31:49 +0200 (Sun, 01 Apr 2012) | 30 lines
Changed paths:
M /trunk/rtl/arm/arm.inc

o patch by Nico Erfurth: add assembly-optimized SwapEndian functions on ARM

Currently the ARM port uses generic functions for SwapEndian, which are
relatively slow.

This patch adds optimized functions for the 32- and 64-bit cases. The 16-bit
case is still handled by a normal Pascal function: while the generated
code is far from optimal, inlining (which is not possible for
asm functions) makes it faster than an optimized asm version.

Some Numbers from my 1.2GHz Kirkwood (ARMv5):

                      Old        New      Result
SwapEndian(Integer)   12.168s    5.411s   44.47%
SwapEndian(Int64)     168.28s    9.015s    5.36%

Testcode was
begin
  I := $FFFFFFF;
  while I > 0 do
    begin
      Val2 := MySwapEndian(Val);
      Dec(I);
    end;
end.

Currently only the plain ARM implementation is tested. ARMv6+ includes a rev
instruction; I've implemented the rev-based variants, but I was not able to
test them.
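
For reference, a self-contained version of the quoted benchmark could look
as follows. This is an editorial sketch: the variable declarations and the
use of the RTL SwapEndian in place of the author's local MySwapEndian are
assumptions, since the original snippet omits them.

program SwapBench;
{ Editorial sketch, not part of the commit. The declarations below and
  the call to the RTL SwapEndian (instead of MySwapEndian) are assumed. }
var
  I: LongWord;
  Val, Val2: LongInt;
begin
  Val := $12345678;
  I := $FFFFFFF;
  while I > 0 do
    begin
      Val2 := SwapEndian(Val);
      Dec(I);
    end;
  WriteLn(Val2);  // use the result so the loop is not optimized away
end.
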
r21648 | masta | 2012-06-18 18:59:34 +0200 (Mon, 18 Jun 2012) | 3 lines
Changed paths:
M /trunk/rtl/arm/strings.inc

ARM assembly versions of strupper and strlower

This is about 1/3 faster than the generic code.
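
For illustration (editorial, not part of the commit): strupper and strlower
convert the buffer in place and return the pointer they were given, so the
result can be printed directly.

program StrCaseDemo;
{ Editorial usage sketch for the Strings-unit routines. }
uses
  Strings;
var
  Buf: array[0..31] of Char;
begin
  StrPCopy(Buf, 'Hello, World!');
  WriteLn(StrUpper(Buf));  // HELLO, WORLD!
  WriteLn(StrLower(Buf));  // hello, world!
end.
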
r21649 | masta | 2012-06-18 18:59:39 +0200 (Mon, 18 Jun 2012) | 4 lines
Changed paths:
M /trunk/rtl/arm/arm.inc
M /trunk/rtl/arm/divide.inc

Use bx lr in ARM-RTL for armv5

ARMv5 supports the BX instruction.
BX is usually better supported by branch prediction units than mov pc,lr.
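
The idiom this commit applies across the RTL looks like the sketch below.
The Identity function is a hypothetical carrier for the pattern; the
conditional-compilation scheme is the one visible in the diffs further down.

{ Editorial sketch of the return idiom. "Identity" is hypothetical; on
  ARM the argument and the result both live in r0, so the body only returns. }
function Identity(x: LongInt): LongInt; assembler; nostackframe;
asm
{$if defined(cpuarmv3) or defined(cpuarmv4)}
        mov     pc,lr   // pre-ARMv5: plain jump back to the caller
{$else}
        bx      lr      // ARMv5+: gives the predictor an explicit return hint
{$endif}
end;
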
r21760 | masta | 2012-07-03 01:54:19 +0200 (Tue, 03 Jul 2012) | 14 lines
Changed paths:
M /trunk/rtl/arm/arm.inc

Small optimizations to FillChar for ARM

The new version is optimized for the common case.

We assume most of the data will be aligned; that's why the unaligned
case has been moved to the end of the function, so the aligned case is
more cache- and pipeline-friendly.

I've also reduced the loop unrolling of the block-transfer loop,
because for large blocks we'll most likely hit the write-buffer limit
anyway.

I did some measurements: the new routine is a bit slower for fewer than
8 bytes, but beats the old one by 10-15% from 8 bytes upwards.
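
The strategy described above can be rendered in high-level Pascal roughly
as follows. This is an editorial sketch of the algorithm, not the shipped
asm; the helper name FillCharSketch is made up.

procedure FillCharSketch(var x; count: LongInt; value: Byte);
var
  p: PByte;
  v: LongWord;
begin
  if count <= 0 then
    exit;
  p := PByte(@x);
  v := value or (LongWord(value) shl 8);
  v := v or (v shl 16);                     // replicate the byte into a word
  while (PtrUInt(p) and 3 <> 0) and (count > 0) do
    begin                                   // align the start, at most 3 passes
      p^ := value; Inc(p); Dec(count);
    end;
  while count >= 8 do
    begin                                   // aligned bulk: 8 bytes per pass
      PLongWord(p)^ := v; Inc(p, 4);
      PLongWord(p)^ := v; Inc(p, 4);
      Dec(count, 8);
    end;
  if count and 4 <> 0 then begin PLongWord(p)^ := v; Inc(p, 4); end;
  if count and 2 <> 0 then begin PWord(p)^ := Word(v); Inc(p, 2); end;
  if count and 1 <> 0 then p^ := value;
end;
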
r21952 | masta | 2012-07-23 09:26:57 +0200 (Mon, 23 Jul 2012) | 15 lines
Changed paths:
M /trunk/rtl/arm/mathu.inc

Fix ARM FPU exception masks

This corrects the handling of exception masks in the ARM VFP
implementations. The old code enabled an exception when it was present
in the mask, so in fact it did the opposite of what it was supposed to
do: a masked exception is one that must be suppressed, not raised.

VFP support is currently broken; this patch at least allows building a
working VFP-native compiler. The full build still breaks because some
compiler options are not properly passed down to packages/, which
results in:

"Trying to use a unit which was compiled with a different FPU mode"

because somehow OPT="-Cfvfpv2" did not get passed down.
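
To illustrate the intended semantics (editorial sketch, not part of the
commit): an exception that is present in the mask is suppressed, so with
exZeroDivide masked a floating-point division by zero yields an infinity
instead of raising EZeroDivide.

program MaskDemo;
{ Editorial sketch: exceptions listed in the mask are suppressed. }
uses
  Math;
var
  zero, x: Double;
begin
  SetExceptionMask([exInvalidOp, exDenormalized, exZeroDivide,
                    exOverflow, exUnderflow, exPrecision]);
  zero := 0.0;
  x := 1.0 / zero;
  WriteLn(x);   // prints +Inf instead of aborting with EZeroDivide
end.
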
r21961 | masta | 2012-07-24 00:58:02 +0200 (Tue, 24 Jul 2012) | 6 lines
Changed paths:
M /trunk/rtl/arm/mathu.inc

Fix ARM FPU Exceptions for WinCE

r21952 introduced wrong code (through copy&paste) for the WinCE
exception-setup routines.

This patch hopefully fixes the code again.

git-svn-id: branches/fixes_2_6@22525 -

marco, commit c063a50cb6
4 changed files with 253 additions and 78 deletions:

  rtl/arm/arm.inc      +180  -52
  rtl/arm/divide.inc     +2   -2
  rtl/arm/mathu.inc     +24  -24
  rtl/arm/strings.inc   +47   -0

rtl/arm/arm.inc: +180 -52

@@ -138,62 +138,69 @@ end;
 Procedure FillChar(var x;count:longint;value:byte);assembler;nostackframe;
 asm
         // less than 0?
-        cmp r1,#0
-{$if defined(cpuarmv3) or defined(cpuarmv4) or defined(cpuarmv5)}
-        movlt pc,lr
+        cmp     r1,#0
+{$if defined(cpuarmv3) or defined(cpuarmv4)}
+        movle   pc,lr
 {$else}
-        bxlt  lr
+        bxle    lr
 {$endif}
         mov     r3,r0
-        cmp     r1,#8           // at least 8 bytes to do?
-        blt     .LFillchar2
-        orr r2,r2,r2,lsl #8
-        orr r2,r2,r2,lsl #16
-.LFillchar0:
-        tst     r3,#3           // aligned yet?
-        strneb r2,[r3],#1
-        subne   r1,r1,#1
-        bne     .LFillchar0
+
+        orr     r2,r2,r2,lsl #8
+        orr     r2,r2,r2,lsl #16
+
+        tst     r3, #3  // Aligned?
+        bne     .LFillchar_do_align
+
+.LFillchar_is_aligned:
+        subs    r1,r1,#8
+        bmi     .LFillchar_less_than_8bytes
+
        mov     ip,r2
-.LFillchar1:
-        cmp     r1,#8           // 8 bytes still to do?
-        blt     .LFillchar2
+.LFillchar_at_least_8bytes:
+        // Do 16 bytes per loop
+        // More unrolling is unnecessary, as we'll just stall on the write buffers
        stmia   r3!,{r2,ip}
-        sub     r1,r1,#8
-        cmp     r1,#8           // 8 bytes still to do?
-        blt     .LFillchar2
-        stmia   r3!,{r2,ip}
-        sub     r1,r1,#8
-        cmp     r1,#8           // 8 bytes still to do?
-        blt     .LFillchar2
-        stmia   r3!,{r2,ip}
-        sub     r1,r1,#8
-        cmp     r1,#8           // 8 bytes still to do?
-        stmgeia r3!,{r2,ip}
-        subge   r1,r1,#8
-        bge     .LFillchar1
-.LFillchar2:
-        movs r1,r1              // anything left?
-{$if defined(cpuarmv3) or defined(cpuarmv4) or defined(cpuarmv5)}
+        subs    r1,r1,#8
+        stmplia r3!,{r2,ip}
+        subpls  r1,r1,#8
+        bpl     .LFillchar_at_least_8bytes
+
+.LFillchar_less_than_8bytes:
+        // Do the rest
+        adds    r1, r1, #8
+
+{$if defined(cpuarmv3) or defined(cpuarmv4)}
        moveq   pc,lr
 {$else}
        bxeq    lr
 {$endif}
-        rsb     r1,r1,#7
-        add     pc,pc,r1,lsl #2
-        mov     r0,r0
-        strb r2,[r3],#1
-        strb r2,[r3],#1
-        strb r2,[r3],#1
-        strb r2,[r3],#1
-        strb r2,[r3],#1
-        strb r2,[r3],#1
-        strb r2,[r3],#1
-{$if defined(cpuarmv3) or defined(cpuarmv4) or defined(cpuarmv5)}
+
+        tst     r1, #4
+        strne   r2,[r3],#4
+        tst     r1, #2
+        strneh  r2,[r3],#2
+        tst     r1, #1
+        strneb  r2,[r3],#1
+{$if defined(cpuarmv3) or defined(cpuarmv4)}
         mov pc,lr
 {$else}
         bx  lr
 {$endif}
+
+// Special case for unaligned start
+// We make a maximum of 3 loops here
+.LFillchar_do_align:
+        strb r2,[r3],#1
+        subs r1, r1, #1
+{$if defined(cpuarmv3) or defined(cpuarmv4)}
+        moveq pc,lr
+{$else}
+        bxeq  lr
+{$endif}
+        tst r3,#3
+        bne .LFillchar_do_align
+        b .LFillchar_is_aligned
 end;
 {$endif FPC_SYSTEM_HAS_FILLCHAR}

@@ -204,7 +211,7 @@ asm
  pld [r0]
  // count <=0 ?
  cmp r2,#0
-{$if defined(cpuarmv3) or defined(cpuarmv4) or defined(cpuarmv5)}
+{$if defined(cpuarmv3) or defined(cpuarmv4)}
  movle pc,lr
 {$else}
  bxle  lr
@@ -221,7 +228,7 @@ asm
  ldrb r3,[r0,r2]
  strb r3,[r1,r2]
  bne .Loverlapped
-{$if defined(cpuarmv3) or defined(cpuarmv4) or defined(cpuarmv5)}
+{$if defined(cpuarmv3) or defined(cpuarmv4)}
  mov pc,lr
 {$else}
  bx  lr
@@ -266,7 +273,7 @@ asm
  str r3,[r1],#4
  bcs .Ldwordloop
  cmp r2,#0
-{$if defined(cpuarmv3) or defined(cpuarmv4) or defined(cpuarmv5)}
+{$if defined(cpuarmv3) or defined(cpuarmv4)}
  moveq pc,lr
 {$else}
  bxeq  lr
@@ -276,7 +283,7 @@ asm
  ldrb r3,[r0],#1
  strb r3,[r1],#1
  bne .Lbyteloop
-{$if defined(cpuarmv3) or defined(cpuarmv4) or defined(cpuarmv5)}
+{$if defined(cpuarmv3) or defined(cpuarmv4)}
  mov pc,lr
 {$else}
  bx  lr
@@ -287,7 +294,7 @@ procedure Move_blended(const source;var dest;count:longint);assembler;nostackfra
 asm
  // count <=0 ?
  cmp r2,#0
-{$if defined(cpuarmv3) or defined(cpuarmv4) or defined(cpuarmv5)}
+{$if defined(cpuarmv3) or defined(cpuarmv4)}
  movle pc,lr
 {$else}
  bxle  lr
@@ -304,7 +311,7 @@ asm
  ldrb r3,[r0,r2]
  strb r3,[r1,r2]
  bne .Loverlapped
-{$if defined(cpuarmv3) or defined(cpuarmv4) or defined(cpuarmv5)}
+{$if defined(cpuarmv3) or defined(cpuarmv4)}
  mov pc,lr
 {$else}
  bx  lr
@@ -346,7 +353,7 @@ asm
  str r3,[r1],#4
  bcs .Ldwordloop
  cmp r2,#0
-{$if defined(cpuarmv3) or defined(cpuarmv4) or defined(cpuarmv5)}
+{$if defined(cpuarmv3) or defined(cpuarmv4)}
  moveq pc,lr
 {$else}
  bxeq  lr
@@ -356,7 +363,7 @@ asm
  ldrb r3,[r0],#1
  strb r3,[r1],#1
  bne .Lbyteloop
-{$if defined(cpuarmv3) or defined(cpuarmv4) or defined(cpuarmv5)}
+{$if defined(cpuarmv3) or defined(cpuarmv4)}
  mov pc,lr
 {$else}
  bx  lr
@@ -535,7 +542,7 @@ asm
       terminating 0, due to the known carry flag sbc can do this.*)
     sbc r0,r1,r0
 .Ldone:
-{$if defined(cpuarmv3) or defined(cpuarmv4) or defined(cpuarmv5)}
+{$if defined(cpuarmv3) or defined(cpuarmv4)}
     mov pc,lr
 {$else}
     bx  lr
@@ -848,6 +855,127 @@ begin
 {$endif FPC_SYSTEM_FPC_MOVE}
 end;
 
+{$define FPC_SYSTEM_HAS_SWAPENDIAN}
+
+{ SwapEndian(<16 Bit>) being inlined is faster than using assembler }
+function SwapEndian(const AValue: SmallInt): SmallInt;{$ifdef SYSTEMINLINE}inline;{$endif}
+  begin
+    { the extra Word type cast is necessary because the "AValue shr 8" }
+    { is turned into "longint(AValue) shr 8", so if AValue < 0 then    }
+    { the sign bits from the upper 16 bits are shifted in rather than  }
+    { zeroes.                                                          }
+    Result := SmallInt((Word(AValue) shr 8) or (Word(AValue) shl 8));
+  end;
+
+
+function SwapEndian(const AValue: Word): Word;{$ifdef SYSTEMINLINE}inline;{$endif}
+  begin
+    Result := Word((AValue shr 8) or (AValue shl 8));
+  end;
+
+(*
+This is kept for reference. That's what the compiler COULD generate in these cases.
+But FPC currently does not support inlining of asm-functions, so the whole call overhead
+is bigger than the gain of the optimized function.
+function AsmSwapEndian(const AValue: SmallInt): SmallInt;{$ifdef SYSTEMINLINE}inline;{$endif};assembler;nostackframe;
+asm
+	// We're starting with 4321
+{$if defined(cpuarmv3) or defined(cpuarmv4) or defined(cpuarmv5)}
+	mov r0, r0, lsl #16	// Shift to make that 2100
+	mov r0, r0, ror #24	// Rotate to 1002
+	orr r0, r0, r0, lsr #16 // Shift and combine into 0012
+{$else}
+	rev r0, r0		// Reverse byteorder    r0 = 1234
+	mov r0, r0, lsr #16	// Shift down to 16bits r0 = 0012
+{$endif}
+end;
+
+*)
+
+function SwapEndian(const AValue: LongInt): LongInt;assembler;nostackframe;
+asm
+        // We're starting with r0 = 4321
+{$if defined(cpuarmv3) or defined(cpuarmv4) or defined(cpuarmv5)}
+        mov r2, r0, lsr #24             // r2 = 0004
+        and r1, r0, #16711680           // r1 = 0300
+        orr r2, r2, r0, lsl #24         // r2 = 1004
+        orr r2, r2, r1, lsr #8          // r2 = 1034
+        and r0, r0, #65280              // r0 = 0020
+        orr r0, r2, r0, lsl #8          // r0 = 1234
+{$else}
+	rev r0, r0
+{$endif}
+end;
+
+function SwapEndian(const AValue: DWord): DWord;assembler;nostackframe;
+asm
+        // We're starting with r0 = 4321
+{$if defined(cpuarmv3) or defined(cpuarmv4) or defined(cpuarmv5)}
+        mov r2, r0, lsr #24             // r2 = 0004
+        and r1, r0, #16711680           // r1 = 0300
+        orr r2, r2, r0, lsl #24         // r2 = 1004
+        orr r2, r2, r1, lsr #8          // r2 = 1034
+        and r0, r0, #65280              // r0 = 0020
+        orr r0, r2, r0, lsl #8          // r0 = 1234
+{$else}
+	rev r0, r0
+{$endif}
+end;
+
+function SwapEndian(const AValue: Int64): Int64; assembler; nostackframe;
+asm
+        // We're starting with r0 = 4321 r1 = 8765
+{$if defined(cpuarmv3) or defined(cpuarmv4) or defined(cpuarmv5)}
+        mov ip, r1
+
+        mov r2, r0, lsr #24             // r2 = 0004
+        and r3, r0, #16711680           // r3 = 0300
+        orr r2, r2, r0, lsl #24         // r2 = 1004
+        orr r2, r2, r3, lsr #8          // r2 = 1034
+        and r0, r0, #65280              // r0 = 0020
+        orr r1, r2, r0, lsl #8          // r1 = 1234
+
+        mov r2, ip, lsr #24             // r2 = 0008
+        and r3, ip, #16711680           // r3 = 0700
+        orr r2, r2, ip, lsl #24         // r2 = 5008
+        orr r2, r2, r3, lsr #8          // r2 = 5078
+        and ip, ip, #65280              // ip = 0060
+        orr r0, r2, ip, lsl #8          // r0 = 5678
+        bx lr
+{$else}
+	rev r2, r0
+	rev r0, r1
+	mov r1, r2
+{$endif}
+end;
+
+function SwapEndian(const AValue: QWord): QWord; assembler; nostackframe;
+asm
+        // We're starting with r0 = 4321 r1 = 8765
+{$if defined(cpuarmv3) or defined(cpuarmv4) or defined(cpuarmv5)}
+        mov ip, r1
+
+        mov r2, r0, lsr #24             // r2 = 0004
+        and r3, r0, #16711680           // r3 = 0300
+        orr r2, r2, r0, lsl #24         // r2 = 1004
+        orr r2, r2, r3, lsr #8          // r2 = 1034
+        and r0, r0, #65280              // r0 = 0020
+        orr r1, r2, r0, lsl #8          // r1 = 1234
+
+        mov r2, ip, lsr #24             // r2 = 0008
+        and r3, ip, #16711680           // r3 = 0700
+        orr r2, r2, ip, lsl #24         // r2 = 5008
+        orr r2, r2, r3, lsr #8          // r2 = 5078
+        and ip, ip, #65280              // ip = 0060
+        orr r0, r2, ip, lsl #8          // r0 = 5678
+        bx lr
+{$else}
+	rev r2, r0
+	rev r0, r1
+	mov r1, r2
+{$endif}
+end;
+
 {include hand-optimized assembler division code}
 {$i divide.inc}
 
rtl/arm/divide.inc: +2 -2

@@ -175,7 +175,7 @@ asm
 .Ldiv_next:
   bcs .Ldiv_loop
   mov r0, r3
-{$if defined(cpuarmv3) or defined(cpuarmv4) or defined(cpuarmv5)}
+{$if defined(cpuarmv3) or defined(cpuarmv4)}
   mov pc, lr
 {$else}
   bx  lr
@@ -184,7 +184,7 @@ asm
   mov r0, #200
   mov r1, r11
   bl handleerrorframe
-{$if defined(cpuarmv3) or defined(cpuarmv4) or defined(cpuarmv5)}
+{$if defined(cpuarmv3) or defined(cpuarmv4)}
   mov pc, lr
 {$else}
   bx  lr

rtl/arm/mathu.inc: +24 -24

@@ -132,17 +132,17 @@ end;
 function ConvertExceptionMask(em: dword): TFPUExceptionMask;
 begin
   Result:=[];
-  if em and _EM_INVALID <> 0 then
+  if em and _EM_INVALID = 0 then
     Result:=Result + [exInvalidOp];
-  if em and _EM_DENORMAL <> 0 then
+  if em and _EM_DENORMAL = 0 then
     Result:=Result + [exDenormalized];
-  if em and _EM_ZERODIVIDE <> 0 then
+  if em and _EM_ZERODIVIDE = 0 then
     Result:=Result + [exZeroDivide];
-  if em and _EM_OVERFLOW <> 0 then
+  if em and _EM_OVERFLOW = 0 then
     Result:=Result + [exOverflow];
-  if em and _EM_UNDERFLOW <> 0 then
+  if em and _EM_UNDERFLOW = 0 then
     Result:=Result + [exUnderflow];
-  if em and _EM_INEXACT <> 0 then
+  if em and _EM_INEXACT = 0 then
     Result:=Result + [exPrecision];
 end;

@@ -156,17 +156,17 @@ var
   c: dword;
 begin
   c:=0;
-  if exInvalidOp in Mask then
+  if not(exInvalidOp in Mask) then
     c:=c or _EM_INVALID;
-  if exDenormalized in Mask then
+  if not(exDenormalized in Mask) then
     c:=c or _EM_DENORMAL;
-  if exZeroDivide in Mask then
+  if not(exZeroDivide in Mask) then
     c:=c or _EM_ZERODIVIDE;
-  if exOverflow in Mask then
+  if not(exOverflow in Mask) then
     c:=c or _EM_OVERFLOW;
-  if exUnderflow in Mask then
+  if not(exUnderflow in Mask) then
     c:=c or _EM_UNDERFLOW;
-  if exPrecision in Mask then
+  if not(exPrecision in Mask) then
     c:=c or _EM_INEXACT;
   c:=_controlfp(c, _MCW_EM);
   Result:=ConvertExceptionMask(c);
@@ -281,22 +281,22 @@ function GetExceptionMask: TFPUExceptionMask;
     Result:=[];
     cw:=VFP_GetCW;
 
-    if (cw and _VFP_ENABLE_IM)<>0 then
+    if (cw and _VFP_ENABLE_IM)=0 then
       include(Result,exInvalidOp);
 
-    if (cw and _VFP_ENABLE_DM)<>0 then
+    if (cw and _VFP_ENABLE_DM)=0 then
       include(Result,exDenormalized);
 
-    if (cw and _VFP_ENABLE_ZM)<>0 then
+    if (cw and _VFP_ENABLE_ZM)=0 then
       include(Result,exZeroDivide);
 
-    if (cw and _VFP_ENABLE_OM)<>0 then
+    if (cw and _VFP_ENABLE_OM)=0 then
       include(Result,exOverflow);
 
-    if (cw and _VFP_ENABLE_UM)<>0 then
+    if (cw and _VFP_ENABLE_UM)=0 then
       include(Result,exUnderflow);
 
-    if (cw and _VFP_ENABLE_PM)<>0 then
+    if (cw and _VFP_ENABLE_PM)=0 then
       include(Result,exPrecision);
   end;

@@ -308,22 +308,22 @@ function SetExceptionMask(const Mask: TFPUExceptionMask): TFPUExceptionMask;
     cw:=VFP_GetCW and not(_VFP_ENABLE_ALL);
 
 {$ifndef darwin}
-    if exInvalidOp in Mask then
+    if not(exInvalidOp in Mask) then
       cw:=cw or _VFP_ENABLE_IM;
 
-    if exDenormalized in Mask then
+    if not(exDenormalized in Mask) then
      cw:=cw or _VFP_ENABLE_DM;
 
-    if exZeroDivide in Mask then
+    if not(exZeroDivide in Mask) then
      cw:=cw or _VFP_ENABLE_ZM;
 
-    if exOverflow in Mask then
+    if not(exOverflow in Mask) then
      cw:=cw or _VFP_ENABLE_OM;
 
-    if exUnderflow in Mask then
+    if not(exUnderflow in Mask) then
      cw:=cw or _VFP_ENABLE_UM;
 
-    if exPrecision in Mask then
+    if not(exPrecision in Mask) then
      cw:=cw or _VFP_ENABLE_PM;
 {$endif}
     VFP_SetCW(cw);

rtl/arm/strings.inc: +47 -0

@@ -15,3 +15,50 @@
 
  **********************************************************************}
 
+{$ifndef FPC_UNIT_HAS_STRUPPER}
+{$define FPC_UNIT_HAS_STRUPPER}
+function strupper(p : pchar) : pchar;assembler;nostackframe;
+asm
+        mov     ip, r0   // Don't change r0, because that's our return value
+
+        ldrb    r1, [ip] // First loop does not postindex
+.LByteLoop:
+        cmp     r1, #0
+{$if defined(cpuarmv3) or defined(cpuarmv4)}
+        moveq   pc, lr
+{$else}
+        bxeq    lr
+{$endif}
+
+        sub     r2, r1, #97   // Normalize 'a' to zero
+        cmp     r2, #25       // temp >= 0 and temp <= 25?
+        subls   r1, r1, #32   // is lowercase, make uppercase
+        strlsb  r1, [ip]      // Store only on change
+        ldrb    r1, [ip, #1]! // Loading here utilizes a load delay slot
+        b       .LByteLoop
+end;
+{$endif FPC_UNIT_HAS_STRUPPER}
+
+{$ifndef FPC_UNIT_HAS_STRLOWER}
+{$define FPC_UNIT_HAS_STRLOWER}
+function strlower(p : pchar) : pchar;assembler;nostackframe;
+asm
+        mov     ip, r0   // Don't change r0, because that's our return value
+
+        ldrb    r1, [ip] // First loop does not postindex
+.LByteLoop:
+        cmp     r1, #0
+{$if defined(cpuarmv3) or defined(cpuarmv4)}
+        moveq   pc, lr
+{$else}
+        bxeq    lr
+{$endif}
+
+        sub     r2, r1, #65   // Normalize 'A' to zero
+        cmp     r2, #25       // temp >= 0 and temp <= 25?
+        addls   r1, r1, #32   // Is uppercase, make lowercase
+        strlsb  r1, [ip]      // Store only on change
+        ldrb    r1, [ip, #1]! // Loading here utilizes a load delay slot
+        b       .LByteLoop
+end;
+{$endif FPC_UNIT_HAS_STRLOWER}