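Fix arm32 builds when using clang's integrated assembler: drop the '%' prefix
from register names (in code and comments) and replace the VFP mnemonics
fldmiad/fstmdbd with vldmia/vstmdb. This is a syntax-only fix for v1.2, needed
until v1.3 is released.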

--- dyncall/dyncall_call_arm32_arm_armhf.S.orig	2021-01-22 15:43:00 UTC
+++ dyncall/dyncall_call_arm32_arm_armhf.S
@@ -59,7 +59,7 @@ ENTRY_C(dcCall_arm32_armhf)
 	add r5,  r1, #16 /* r5 = stack args (after intreg ones) */
 
 	/* Load 16 single-precision registers (= 8 double-precision registers). */
-	fldmiad r3, {d0-d7}
+	vldmia r3, {d0-d7}
 
 	/* prep stack parameter area (includes room for spill area, callee spills if needed) */
 	sub r13, r13, r2
@@ -77,7 +77,7 @@ armhf_pushArgs:
 
 armhf_call:
 	ldmia r1, {r0-r3}  /* Load first 4 arguments for new call into r0-r3. */
-	                   /* 'blx %r4' workaround for ARMv4t: */
+	                   /* 'blx r4' workaround for ARMv4t: */
 	mov r14, r15       /* Branch return address(r15) -> link register (r14) -- r15 always points to address of current + 2 instructions (= Epilog code). */ 
 	bx  r4             /* Call (ARM/THUMB), available for ARMv4t. */
 
--- dyncall/dyncall_call_arm32_thumb_armhf.S.orig	2021-01-22 15:43:00 UTC
+++ dyncall/dyncall_call_arm32_thumb_armhf.S
@@ -68,7 +68,7 @@ ENTRY_C(dcCall_arm32_armhf)
 	mov	r5  , r1	 /* r5 = 'args' (2nd argument is passed in r1). */
 	
 	/* Load 16 single-precision registers (= 8 double-precision registers). */
-	fldmiad	r3, {d0-d7}
+	vldmia	r3, {d0-d7}
 
 	sub	r2 , #16	
 	cmp     r2, #0
@@ -99,7 +99,7 @@ armhf_pushArgs:
 
 armhf_call:
 	ldmia	r5!, {r0-r3}	/* Load first 4 arguments for new call into r0-r3. */
-				/* 'blx %r4' workaround for ARMv4t: */
+				/* 'blx r4' workaround for ARMv4t: */
 	// mov	r14, r15	/*   Branch return address(r15) -> link register (r14) -- r15 always points to address of current + 2 instructions (= Epilog code). */ 
 	mov	r6, r15
 	add	r6, #5
--- dyncall/dyncall_call_arm32_thumb_gas.s.orig	2021-01-22 15:43:00 UTC
+++ dyncall/dyncall_call_arm32_thumb_gas.s
@@ -37,45 +37,45 @@ dcCall_arm32_thumb:
 
 	/* Prolog. This function never needs to spill inside its prolog, so just store the permanent registers. */
 	/* Code below is not using high registers, so not storing them in prolog, which is more involved with thumb, anyways. */
-	push	{%r4-%r7, %r14}		/* Frame ptr, permanent registers, link register -> save area on stack. */
-	mov		%r7, %r13	/* Set frame ptr. */
-	sub		%r13, #4	/* Realign stack to 8 bytes (b/c we stored 5 regs = 20b). */
+	push	{r4-r7, r14}		/* Frame ptr, permanent registers, link register -> save area on stack. */
+	mov		r7, r13	/* Set frame ptr. */
+	sub		r13, #4	/* Realign stack to 8 bytes (b/c we stored 5 regs = 20b). */
 
 	/* Call. */
-	mov		%r4, %r0	/* Move 'fptr' to r4 (1st argument is passed in r0). */
-	mov		%r5, %r1	/* Move 'args' to r5 (2nd argument is passed in r1). */
-	mov		%r6, %r2	/* Move 'size' to r6 (3rd argument is passed in r2). */
+	mov		r4, r0	/* Move 'fptr' to r4 (1st argument is passed in r0). */
+	mov		r5, r1	/* Move 'args' to r5 (2nd argument is passed in r1). */
+	mov		r6, r2	/* Move 'size' to r6 (3rd argument is passed in r2). */
 
-	cmp		%r6, #16	/* Jump to call if no more than 4 arguments. */
+	cmp		r6, #16	/* Jump to call if no more than 4 arguments. */
 	ble		call
 
-	sub		%r6, #16	/* Size of remaining arguments. */
-	mov		%r0, %r13	/* Set stack pointer to top of stack. */
-	sub		%r0, %r0, %r6
-	lsr		%r0, #3		/* Align stack on 8 byte boundaries. */
-	lsl		%r0, #3
-	mov		%r13, %r0
+	sub		r6, #16	/* Size of remaining arguments. */
+	mov		r0, r13	/* Set stack pointer to top of stack. */
+	sub		r0, r0, r6
+	lsr		r0, #3		/* Align stack on 8 byte boundaries. */
+	lsl		r0, #3
+	mov		r13, r0
 
-	add		%r1, #16	/* Let r1 point to remaining arguments. */
-	mov		%r2, #0		/* Init byte counter to 0. */
+	add		r1, #16	/* Let r1 point to remaining arguments. */
+	mov		r2, #0		/* Init byte counter to 0. */
 .thumb_func
 pushArgs:
-	ldrb	%r3, [%r1, %r2]		/* Load a byte into r3. */
-	strb	%r3, [%r0, %r2]		/* Push byte onto stack. */
-	add		%r2, %r2, #1	/* Increment byte counter. */
-	cmp		%r2, %r6
+	ldrb	r3, [r1, r2]		/* Load a byte into r3. */
+	strb	r3, [r0, r2]		/* Push byte onto stack. */
+	add		r2, r2, #1	/* Increment byte counter. */
+	cmp		r2, r6
 	bne		pushArgs
 .thumb_func
 call:
-	ldmia	%r5!, {%r0-%r3}		/* Load first 4 arguments for new call into r0-r3. */
+	ldmia	r5!, {r0-r3}		/* Load first 4 arguments for new call into r0-r3. */
 					
-					/* 'blx %r4' workaround for ARMv4t in THUMB: */
-	mov		%r6,  %r15	/* Load PC+2 instructions from here */
-	add		%r6,  #5	/* Increment by 2 instructions (Address of 'Epilog') and set bit 0 (THUMB) */
-	mov		%r14, %r6	/* Store in link register. */
-	bx		%r4		/* Branch and force THUMB-mode return (LR bit 0 set). */
+					/* 'blx r4' workaround for ARMv4t in THUMB: */
+	mov		r6,  r15	/* Load PC+2 instructions from here */
+	add		r6,  #5	/* Increment by 2 instructions (Address of 'Epilog') and set bit 0 (THUMB) */
+	mov		r14, r6	/* Store in link register. */
+	bx		r4		/* Branch and force THUMB-mode return (LR bit 0 set). */
 
 	/* Epilog. */
-	mov		%r13, %r7	/* Reset stack ptr. */
-	pop		{%r4-%r7, %r15}	/* Restore permanent registers and program counter. (Force a stay in THUMB in ARMv4, whether ARMv5 can return in ARM or THUMB depending on the bit 0. */
+	mov		r13, r7	/* Reset stack ptr. */
+	pop		{r4-r7, r15}	/* Restore permanent registers and program counter. (Force a stay in THUMB in ARMv4, whether ARMv5 can return in ARM or THUMB depending on the bit 0. */
 
--- dyncallback/dyncall_callback_arm32_arm_gas.S.orig	2021-01-22 15:43:00 UTC
+++ dyncallback/dyncall_callback_arm32_arm_gas.S
@@ -47,37 +47,37 @@ CTX_userdata    =  16
 dcCallbackThunkEntry:
 
 	/* Prolog. This function never needs to spill inside its prolog, so just store the permanent registers. */
-	stmdb	%r13, {%r4-%r11, %r13, %r14}	/* Permanent registers and stack pointer, etc... -> save area on stack (except counter). */
-	mov		%r11, %r13						/* Set frame pointer. */
-	sub		%r13, %r13, #40					/* Adjust stack pointer. */
+	stmdb	r13, {r4-r11, r13, r14}	/* Permanent registers and stack pointer, etc... -> save area on stack (except counter). */
+	mov		r11, r13						/* Set frame pointer. */
+	sub		r13, r13, #40					/* Adjust stack pointer. */
 
 	/* Grab arguments. */
-	mov		%r4, #0
+	mov		r4, #0
 #if defined(DC__ABI_ARM_HF)
-	stmdb	%r13!, {%r4}					/* Init freg_count and dreg_count to 0 */
-	stmdb	%r13!, {%r4}
-	fstmdbd	%r13!, {d0-d7}					/* Store all fp-registers in DCArgs' f[16] */
+	stmdb	r13!, {r4}					/* Init freg_count and dreg_count to 0 */
+	stmdb	r13!, {r4}
+	vstmdb	r13!, {d0-d7}					/* Store all fp-registers in DCArgs' f[16] */
 #endif
-	stmdb	%r13!, {%r0-%r4, %r11}			/* Spill first 4 args to DCArgs, along with reg_count (init to 0) and (stack) pointer to remaining args. */
+	stmdb	r13!, {r0-r4, r11}			/* Spill first 4 args to DCArgs, along with reg_count (init to 0) and (stack) pointer to remaining args. */
 
 	/* Prepare callback handler call. */
-	mov		%r0, %r12						/* Parameter 0 (r0) = DCCallback pointer (r12, pointer to thunk). */
-	mov		%r1, %r13						/* Parameter 1 (r1) = DCArgs pointer (r13, stack pointer). */
-	sub		%r13, %r13, #DCValue_size		/* Make room for return value. */
-	mov		%r2, %r13						/* Parameter 2 (r2) = results pointer. */
-	ldr		%r3, [%r12, #CTX_userdata]		/* Parameter 3 (r3) = userdata pointer. */
+	mov		r0, r12						/* Parameter 0 (r0) = DCCallback pointer (r12, pointer to thunk). */
+	mov		r1, r13						/* Parameter 1 (r1) = DCArgs pointer (r13, stack pointer). */
+	sub		r13, r13, #DCValue_size		/* Make room for return value. */
+	mov		r2, r13						/* Parameter 2 (r2) = results pointer. */
+	ldr		r3, [r12, #CTX_userdata]		/* Parameter 3 (r3) = userdata pointer. */
 
 	/* Call. */
-	ldr		%r4, [%r12, #CTX_handler]		/* Load callback handler pointer into r4. */
-	mov		%r14, %r15						/* Branch return address(r15) -> link register (r14) -- r15 always points to address of current + 2 instructions (= Epilog code). */
-	bx		%r4								/* Call. */
+	ldr		r4, [r12, #CTX_handler]		/* Load callback handler pointer into r4. */
+	mov		r14, r15						/* Branch return address(r15) -> link register (r14) -- r15 always points to address of current + 2 instructions (= Epilog code). */
+	bx		r4								/* Call. */
 
 	/* Return value. */
-	ldmia	%r13, {%r0, %r1}				/* Load return value in r0 and r1. */
+	ldmia	r13, {r0, r1}				/* Load return value in r0 and r1. */
 #if defined(DC__ABI_ARM_HF)
-	fldmiad	%r13, {%d0}						/* Same for floating point return value (if any). */
+	vldmia	r13, {d0}						/* Same for floating point return value (if any). */
 #endif
 
 	/* Epilog. */
-	ldmdb	%r11, {%r4-%r11, %r13, %r15}	/* Restore permanent registers (restore stack ptr and program counter).@@@db not needed since we rewrite r13? */
+	ldmdb	r11, {r4-r11, r13, r15}	/* Restore permanent registers (restore stack ptr and program counter).@@@db not needed since we rewrite r13? */
 
