diff --git a/src/core/arm/dyncom/arm_dyncom_dec.cpp b/src/core/arm/dyncom/arm_dyncom_dec.cpp
index 333b40f54..0927eece1 100644
--- a/src/core/arm/dyncom/arm_dyncom_dec.cpp
+++ b/src/core/arm/dyncom/arm_dyncom_dec.cpp
@@ -136,7 +136,6 @@ const ISEITEM arm_instruction[] = {
     { "pkhbt", 2, 6, 20, 27, 0x00000068, 4, 6, 0x00000001 },
     { "smul", 3, 4, 20, 27, 0x00000016, 7, 7, 0x00000001, 4, 4, 0x00000000 },
     { "smlalxy", 3, 4, 20, 27, 0x00000014, 7, 7, 0x00000001, 4, 4, 0x00000000 },
-    //	{"smlal"	,  2	,  4	, 21, 27, 0x00000007,  4,  7, 0x00000009},
     { "smla", 3, 4, 20, 27, 0x00000010, 7, 7, 0x00000001, 4, 4, 0x00000000 },
     { "mcrr", 1, 6, 20, 27, 0x000000c4 },
     { "mrrc", 1, 6, 20, 27, 0x000000c5 },
@@ -194,6 +193,10 @@ const ISEITEM arm_instruction[] = {
     { "ldc", 2, 0, 25, 27, 0x00000006, 20, 20, 0x00000001 },
     { "swi", 1, 0, 24, 27, 0x0000000f },
     { "bbl", 1, 0, 25, 27, 0x00000005 },
+    { "ldrexd", 2, ARMV6K, 20, 27, 0x0000001B, 4, 7, 0x00000009 },
+    { "strexd", 2, ARMV6K, 20, 27, 0x0000001A, 4, 7, 0x00000009 },
+    { "ldrexh", 2, ARMV6K, 20, 27, 0x0000001F, 4, 7, 0x00000009 },
+    { "strexh", 2, ARMV6K, 20, 27, 0x0000001E, 4, 7, 0x00000009 },
 };
 
 const ISEITEM arm_exclusion_code[] = {
@@ -383,6 +386,11 @@ const ISEITEM arm_exclusion_code[] = {
     { "ldc", 0, 0, 0 },
     { "swi", 0, 0, 0 },
     { "bbl", 0, 0, 0 },
+    { "ldrexd", 0, ARMV6K, 0 },
+    { "strexd", 0, ARMV6K, 0 },
+    { "ldrexh", 0, ARMV6K, 0 },
+    { "strexh", 0, ARMV6K, 0 },
+
     { "bl_1_thumb", 0, INVALID, 0 },    // Should be table[-4]
     { "bl_2_thumb", 0, INVALID, 0 },    // Should be located at the end of the table[-3]
     { "blx_1_thumb", 0, INVALID, 0 },   // Should be located at table[-2]
@@ -395,6 +403,7 @@ int decode_arm_instr(uint32_t instr, int32_t *idx) {
     int ret = DECODE_FAILURE;
     int i = 0;
     int instr_slots = sizeof(arm_instruction) / sizeof(ISEITEM);
+
     for (i = 0; i < instr_slots; i++) {
         n = arm_instruction[i].attribute_value;
         base = 0;
diff --git a/src/core/arm/dyncom/arm_dyncom_dec.h b/src/core/arm/dyncom/arm_dyncom_dec.h
index 70eb96e93..58784aeea 100644
--- a/src/core/arm/dyncom/arm_dyncom_dec.h
+++ b/src/core/arm/dyncom/arm_dyncom_dec.h
@@ -1,153 +1,117 @@
-/* Copyright (C) 
-* 2012 - Michael.Kang blackfin.kang@gmail.com
-* This program is free software; you can redistribute it and/or
-* modify it under the terms of the GNU General Public License
-* as published by the Free Software Foundation; either version 2
-* of the License, or (at your option) any later version.
-* 
-* This program is distributed in the hope that it will be useful,
-* but WITHOUT ANY WARRANTY; without even the implied warranty of
-* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-* GNU General Public License for more details.
-* 
-* You should have received a copy of the GNU General Public License
-* along with this program; if not, write to the Free Software
-* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
-* 
-*/
+// Copyright 2012 Michael Kang, 2015 Citra Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
 
-/**
-* @file arm_dyncom_dec.h
-* @brief Some common utility for arm instruction decoder
-* @author Michael.Kang blackfin.kang@gmail.com
-* @version 7849
-* @date 2012-03-15
-*/
+#pragma once
 
-#ifndef __ARM_DYNCOM_DEC__
-#define __ARM_DYNCOM_DEC__
+#define BITS(a,b)   ((instr >> (a)) & ((1 << (1+(b)-(a)))-1))
+#define BIT(n)      ((instr >> (n)) & 1)
+#define BAD         do { printf("meet BAD at %s, instr is %x\n", __FUNCTION__, instr ); } while(0);
+#define ptr_N       cpu->ptr_N
+#define ptr_Z       cpu->ptr_Z
+#define ptr_C       cpu->ptr_C
+#define ptr_V       cpu->ptr_V
+#define ptr_I       cpu->ptr_I
+#define ptr_T       cpu->ptr_T
+#define ptr_CPSR    cpu->ptr_gpr[16]
 
-#define BITS(a,b) ((instr >> (a)) & ((1 << (1+(b)-(a)))-1))
-#define BIT(n) ((instr >> (n)) & 1)
-#define BAD	do{printf("meet BAD at %s, instr is %x\n", __FUNCTION__, instr ); /*exit(0);*/}while(0);
-#define ptr_N	cpu->ptr_N
-#define ptr_Z	cpu->ptr_Z
-#define ptr_C	cpu->ptr_C
-#define ptr_V	cpu->ptr_V
-#define ptr_I 	cpu->ptr_I
-#define ptr_T 	cpu->ptr_T
-#define	ptr_CPSR cpu->ptr_gpr[16]
+// For MUL instructions
+#define RDHi        ((instr >> 16) & 0xF)
+#define RDLo        ((instr >> 12) & 0xF)
+#define MUL_RD      ((instr >> 16) & 0xF)
+#define MUL_RN      ((instr >> 12) & 0xF)
+#define RS          ((instr >> 8) & 0xF)
+#define RD          ((instr >> 12) & 0xF)
+#define RN          ((instr >> 16) & 0xF)
+#define RM          (instr & 0xF)
 
-/* for MUL instructions */
-/*xxxx xxxx xxxx 1111 xxxx xxxx xxxx xxxx */
-#define RDHi ((instr >> 16) & 0xF)
-/*xxxx xxxx xxxx xxxx 1111 xxxx xxxx xxxx */
-#define RDLo ((instr >> 12) & 0xF)
-/*xxxx xxxx xxxx 1111 xxxx xxxx xxxx xxxx */
-#define MUL_RD ((instr >> 16) & 0xF)
-/*xxxx xxxx xxxx xxxx 1111 xxxx xxxx xxxx */
-#define MUL_RN ((instr >> 12) & 0xF)
-/*xxxx xxxx xxxx xxxx xxxx 1111 xxxx xxxx */
-#define RS ((instr >> 8) & 0xF)
+// CP15 registers
+#define OPCODE_1    BITS(21, 23)
+#define CRn         BITS(16, 19)
+#define CRm         BITS(0, 3)
+#define OPCODE_2    BITS(5, 7)
 
-/*xxxx xxxx xxxx xxxx 1111 xxxx xxxx xxxx */
-#define RD ((instr >> 12) & 0xF)
-/*xxxx xxxx xxxx 1111 xxxx xxxx xxxx xxxx */
-#define RN ((instr >> 16) & 0xF)
-/*xxxx xxxx xxxx xxxx xxxx xxxx xxxx 1111 */
-#define RM (instr & 0xF)
+#define I           BIT(25)
+#define S           BIT(20)
 
-/* CP15 registers */
-#define OPCODE_1        BITS(21, 23)
-#define CRn             BITS(16, 19)
-#define CRm             BITS(0, 3)
-#define OPCODE_2        BITS(5, 7)
+#define SHIFT       BITS(5,6)   // instr[6:5]
+#define SHIFT_IMM   BITS(7,11)  // instr[11:7]
+#define IMMH        BITS(8,11)  // instr[11:8]
+#define IMML        BITS(0,3)   // instr[3:0]
 
-/*xxxx xx1x xxxx xxxx xxxx xxxx xxxx xxxx */
-#define I BIT(25)
-/*xxxx xxxx xxx1 xxxx xxxx xxxx xxxx xxxx */
-#define S BIT(20)
+#define LSPBIT      BIT(24)
+#define LSUBIT      BIT(23)
+#define LSBBIT      BIT(22)
+#define LSWBIT      BIT(21)
+#define LSLBIT      BIT(20)
+#define LSSHBITS    BITS(5,6)
+#define OFFSET12    BITS(0,11)
+#define SBIT        BIT(20)
+#define DESTReg     (BITS (12, 15))
 
-#define SHIFT BITS(5,6)
-#define SHIFT_IMM BITS(7,11)
-#define IMMH BITS(8,11)
-#define IMML BITS(0,3)
-
-#define LSPBIT  BIT(24)
-#define LSUBIT  BIT(23)
-#define LSBBIT  BIT(22)
-#define LSWBIT  BIT(21)
-#define LSLBIT  BIT(20)
-#define LSSHBITS BITS(5,6)
-#define OFFSET12 BITS(0,11)
-#define SBIT  BIT(20)
-#define DESTReg (BITS (12, 15))
-
-/* they are in unused state, give a corrent value when using */
+// These are currently unused; give them correct values before using them
 #define IS_V5E 0
 #define IS_V5  0
 #define IS_V6  0
 #define LHSReg 0
 
-/* temp define the using the pc reg need implement a flow */
-#define STORE_CHECK_RD_PC	ADD(R(RD), CONST(INSTR_SIZE * 2))
+// Temporary definition: using the PC register here still needs a proper flow to be implemented
+#define STORE_CHECK_RD_PC   ADD(R(RD), CONST(INSTR_SIZE * 2))
 
-#define OPERAND operand(cpu,instr,bb,NULL)
-#define SCO_OPERAND(sco) operand(cpu,instr,bb,sco)
-#define BOPERAND boperand(instr)
+#define OPERAND             operand(cpu,instr,bb,NULL)
+#define SCO_OPERAND(sco)    operand(cpu,instr,bb,sco)
+#define BOPERAND            boperand(instr)
 
-#define CHECK_RN_PC  (RN==15? ADD(AND(R(RN), CONST(~0x1)), CONST(INSTR_SIZE * 2)):R(RN))
-#define CHECK_RN_PC_WA  (RN==15? ADD(AND(R(RN), CONST(~0x3)), CONST(INSTR_SIZE * 2)):R(RN))
+#define CHECK_RN_PC         (RN == 15 ? ADD(AND(R(RN), CONST(~0x1)), CONST(INSTR_SIZE * 2)) : R(RN))
+#define CHECK_RN_PC_WA      (RN == 15 ? ADD(AND(R(RN), CONST(~0x3)), CONST(INSTR_SIZE * 2)) : R(RN))
 
-#define GET_USER_MODE() (OR(ICMP_EQ(R(MODE_REG), CONST(USER32MODE)), ICMP_EQ(R(MODE_REG), CONST(SYSTEM32MODE))))
+#define GET_USER_MODE()     (OR(ICMP_EQ(R(MODE_REG), CONST(USER32MODE)), ICMP_EQ(R(MODE_REG), CONST(SYSTEM32MODE))))
 
 int decode_arm_instr(uint32_t instr, int32_t *idx);
 
 enum DECODE_STATUS {
-	DECODE_SUCCESS,
-	DECODE_FAILURE
+    DECODE_SUCCESS,
+    DECODE_FAILURE
 };
 
 struct instruction_set_encoding_item {
-        const char *name;
-        int attribute_value;
-        int version;
-        u32 content[21];
+    const char *name;
+    int attribute_value;
+    int version;
+    u32 content[21];
 };
 
 typedef struct instruction_set_encoding_item ISEITEM;
 
-#define RECORD_WB(value, flag) {cpu->dyncom_engine->wb_value = value;cpu->dyncom_engine->wb_flag = flag;}
+#define RECORD_WB(value, flag) { cpu->dyncom_engine->wb_value = value;cpu->dyncom_engine->wb_flag = flag; }
 #define INIT_WB(wb_value, wb_flag) RECORD_WB(wb_value, wb_flag)
 
-#define EXECUTE_WB(base_reg)		{if(cpu->dyncom_engine->wb_flag) \
-                                               LET(base_reg, cpu->dyncom_engine->wb_value);}
-inline int get_reg_count(uint32_t instr){
-	int i =  BITS(0,15);
-	int count = 0;
-	while(i){
-		if(i & 1)
-			count ++;
-		i = i >> 1;
-	}
-	return count;
+#define EXECUTE_WB(base_reg) { if(cpu->dyncom_engine->wb_flag) LET(base_reg, cpu->dyncom_engine->wb_value); }
+
+inline int get_reg_count(uint32_t instr) {
+    // Tally the set bits of instr[15:0] (presumably an LDM/STM-style register list).
+    int set_bits = 0;
+    for (int mask = BITS(0, 15); mask != 0; mask >>= 1) {
+        if ((mask & 1) != 0) {
+            ++set_bits;
+        }
+    }
+    return set_bits;
+}
 
 enum ARMVER {
-	INVALID = 0,
-        ARMALL,
-        ARMV4,
-        ARMV4T,
-        ARMV5T,
-        ARMV5TE,
-        ARMV5TEJ,
-        ARMV6,
-	ARM1176JZF_S,
-        ARMVFP2,
-        ARMVFP3
+    INVALID = 0,
+    ARMALL,
+    ARMV4,
+    ARMV4T,
+    ARMV5T,
+    ARMV5TE,
+    ARMV5TEJ,
+    ARMV6,
+    ARM1176JZF_S,
+    ARMVFP2,
+    ARMVFP3,
+    ARMV6K,
 };
 
-//extern const INSTRACT arm_instruction_action[];
 extern const ISEITEM arm_instruction[];
-
-#endif
diff --git a/src/core/arm/dyncom/arm_dyncom_interpreter.cpp b/src/core/arm/dyncom/arm_dyncom_interpreter.cpp
index bc55a082f..5f09d8580 100644
--- a/src/core/arm/dyncom/arm_dyncom_interpreter.cpp
+++ b/src/core/arm/dyncom/arm_dyncom_interpreter.cpp
@@ -622,9 +622,7 @@ void LdnStM(DecrementAfter)(arm_processor *cpu, unsigned int inst, unsigned int
     }
     unsigned int rn = CHECK_READ_REG15_WA(cpu, Rn);
     unsigned int start_addr = rn - count * 4 + 4;
-    unsigned int end_addr   = rn;
 
-    virt_addr = end_addr;
     virt_addr = start_addr;
 
     if (CondPassed(cpu, BITS(inst, 28, 31)) && BIT(inst, 21)) {
@@ -1104,10 +1102,10 @@ typedef struct _blx_1_thumb {
 }blx_1_thumb;
 
 typedef struct _pkh_inst {
-    u32 Rm;
-    u32 Rn;
-    u32 Rd;
-    u8 imm;
+    unsigned int Rm;
+    unsigned int Rn;
+    unsigned int Rd;
+    unsigned char imm;
 } pkh_inst;
 
 typedef arm_inst * ARM_INST_PTR;
@@ -1740,40 +1738,31 @@ ARM_INST_PTR INTERPRETER_TRANSLATE(ldrd)(unsigned int inst, int index)
 
     return inst_base;
 }
-
 ARM_INST_PTR INTERPRETER_TRANSLATE(ldrex)(unsigned int inst, int index)
 {
-    arm_inst *inst_base = (arm_inst *)AllocBuffer(sizeof(arm_inst) + sizeof(ldst_inst));
-    ldst_inst *inst_cream = (ldst_inst *)inst_base->component;
+    arm_inst *inst_base = (arm_inst *)AllocBuffer(sizeof(arm_inst) + sizeof(generic_arm_inst));
+    generic_arm_inst *inst_cream = (generic_arm_inst *)inst_base->component;
 
     inst_base->cond = BITS(inst, 28, 31);
-    inst_base->idx     = index;
-    inst_base->br     = NON_BRANCH;
+    inst_base->idx  = index;
+    inst_base->br   = (BITS(inst, 12, 15) == 15) ? INDIRECT_BRANCH : NON_BRANCH; // Branch if dest is R15
 
-    inst_cream->inst = inst;
-    //inst_cream->get_addr = get_calc_addr_op(inst);
+    inst_cream->Rn = BITS(inst, 16, 19);
+    inst_cream->Rd = BITS(inst, 12, 15);
 
-    if (BITS(inst, 12, 15) == 15) {
-        inst_base->br = INDIRECT_BRANCH;
-    }
     return inst_base;
 }
 ARM_INST_PTR INTERPRETER_TRANSLATE(ldrexb)(unsigned int inst, int index)
 {
-    arm_inst *inst_base = (arm_inst *)AllocBuffer(sizeof(arm_inst) + sizeof(ldst_inst));
-    ldst_inst *inst_cream = (ldst_inst *)inst_base->component;
-
-    inst_base->cond = BITS(inst, 28, 31);
-    inst_base->idx     = index;
-    inst_base->br     = NON_BRANCH;
-
-    inst_cream->inst = inst;
-    inst_cream->get_addr = get_calc_addr_op(inst);
-
-    if (BITS(inst, 12, 15) == 15) {
-        inst_base->br = INDIRECT_BRANCH;
-    }
-    return inst_base;
+    return INTERPRETER_TRANSLATE(ldrex)(inst, index);
+}
+ARM_INST_PTR INTERPRETER_TRANSLATE(ldrexh)(unsigned int inst, int index)
+{
+    return INTERPRETER_TRANSLATE(ldrex)(inst, index); // Reuses the ldrex translator (fills cond/idx/br, Rn, Rd); `index` is passed through unchanged
+}
+ARM_INST_PTR INTERPRETER_TRANSLATE(ldrexd)(unsigned int inst, int index)
+{
+    return INTERPRETER_TRANSLATE(ldrex)(inst, index); // Reuses the ldrex translator (fills cond/idx/br, Rn, Rd); `index` is passed through unchanged
 }
 ARM_INST_PTR INTERPRETER_TRANSLATE(ldrh)(unsigned int inst, int index)
 {
@@ -2623,37 +2612,30 @@ ARM_INST_PTR INTERPRETER_TRANSLATE(strd)(unsigned int inst, int index){
 }
 ARM_INST_PTR INTERPRETER_TRANSLATE(strex)(unsigned int inst, int index)
 {
-    arm_inst *inst_base = (arm_inst *)AllocBuffer(sizeof(arm_inst) + sizeof(ldst_inst));
-    ldst_inst *inst_cream = (ldst_inst *)inst_base->component;
+    arm_inst *inst_base = (arm_inst *)AllocBuffer(sizeof(arm_inst) + sizeof(generic_arm_inst));
+    generic_arm_inst *inst_cream = (generic_arm_inst *)inst_base->component;
 
     inst_base->cond = BITS(inst, 28, 31);
-    inst_base->idx     = index;
-    inst_base->br     = NON_BRANCH;
+    inst_base->idx  = index;
+    inst_base->br   = NON_BRANCH;
 
-    inst_cream->inst = inst;
-    inst_cream->get_addr = get_calc_addr_op(inst);
+    inst_cream->Rn  = BITS(inst, 16, 19);
+    inst_cream->Rd  = BITS(inst, 12, 15);
+    inst_cream->Rm  = BITS(inst, 0,   3);
 
-    if (BITS(inst, 12, 15) == 15) {
-        inst_base->br = INDIRECT_BRANCH;
-    }
     return inst_base;
 }
 ARM_INST_PTR INTERPRETER_TRANSLATE(strexb)(unsigned int inst, int index)
 {
-    arm_inst *inst_base = (arm_inst *)AllocBuffer(sizeof(arm_inst) + sizeof(ldst_inst));
-    ldst_inst *inst_cream = (ldst_inst *)inst_base->component;
-
-    inst_base->cond = BITS(inst, 28, 31);
-    inst_base->idx     = index;
-    inst_base->br     = NON_BRANCH;
-
-    inst_cream->inst = inst;
-    inst_cream->get_addr = get_calc_addr_op(inst);
-
-    if (BITS(inst, 12, 15) == 15) {
-        inst_base->br = INDIRECT_BRANCH;
-    }
-    return inst_base;
+    return INTERPRETER_TRANSLATE(strex)(inst, index);
+}
+ARM_INST_PTR INTERPRETER_TRANSLATE(strexh)(unsigned int inst, int index)
+{
+    return INTERPRETER_TRANSLATE(strex)(inst, index); // Reuses the strex translator (fills cond/idx/br, Rn, Rd, Rm); `index` is passed through unchanged
+}
+ARM_INST_PTR INTERPRETER_TRANSLATE(strexd)(unsigned int inst, int index)
+{
+    return INTERPRETER_TRANSLATE(strex)(inst, index); // Reuses the strex translator (fills cond/idx/br, Rn, Rd, Rm); `index` is passed through unchanged
 }
 ARM_INST_PTR INTERPRETER_TRANSLATE(strh)(unsigned int inst, int index)
 {
@@ -3355,6 +3337,11 @@ const transop_fp_t arm_instruction_trans[] = {
     INTERPRETER_TRANSLATE(ldc),
     INTERPRETER_TRANSLATE(swi),
     INTERPRETER_TRANSLATE(bbl),
+    INTERPRETER_TRANSLATE(ldrexd),
+    INTERPRETER_TRANSLATE(strexd),
+    INTERPRETER_TRANSLATE(ldrexh),
+    INTERPRETER_TRANSLATE(strexh),
+
     // All the thumb instructions should be placed the end of table
     INTERPRETER_TRANSLATE(b_2_thumb), 
     INTERPRETER_TRANSLATE(b_cond_thumb), 
@@ -3551,6 +3538,7 @@ unsigned InterpreterMainLoop(ARMul_State* state) {
     #define CRm             inst_cream->crm
     #define CP15_REG(n)     cpu->CP15[CP15(n)]
     #define RD              cpu->Reg[inst_cream->Rd]
+    #define RD2             cpu->Reg[inst_cream->Rd + 1]
     #define RN              cpu->Reg[inst_cream->Rn]
     #define RM              cpu->Reg[inst_cream->Rm]
     #define RS              cpu->Reg[inst_cream->Rs]
@@ -3762,14 +3750,18 @@ unsigned InterpreterMainLoop(ARMul_State* state) {
     case 182: goto LDC_INST; \
     case 183: goto SWI_INST; \
     case 184: goto BBL_INST; \
-    case 185: goto B_2_THUMB ; \
-    case 186: goto B_COND_THUMB ; \
-    case 187: goto BL_1_THUMB ; \
-    case 188: goto BL_2_THUMB ; \
-    case 189: goto BLX_1_THUMB ; \
-    case 190: goto DISPATCH; \
-    case 191: goto INIT_INST_LENGTH; \
-    case 192: goto END; \
+    case 185: goto LDREXD_INST; \
+    case 186: goto STREXD_INST; \
+    case 187: goto LDREXH_INST; \
+    case 188: goto STREXH_INST; \
+    case 189: goto B_2_THUMB ; \
+    case 190: goto B_COND_THUMB ; \
+    case 191: goto BL_1_THUMB ; \
+    case 192: goto BL_2_THUMB ; \
+    case 193: goto BLX_1_THUMB ; \
+    case 194: goto DISPATCH; \
+    case 195: goto INIT_INST_LENGTH; \
+    case 196: goto END; \
     }
 #endif
 
@@ -3830,8 +3822,9 @@ unsigned InterpreterMainLoop(ARMul_State* state) {
         &&MLA_INST,&&SSAT_INST,&&USAT_INST,&&MRS_INST,&&MSR_INST,&&AND_INST,&&BIC_INST,&&LDM_INST,&&EOR_INST,&&ADD_INST,&&RSB_INST,&&RSC_INST,
         &&SBC_INST,&&ADC_INST,&&SUB_INST,&&ORR_INST,&&MVN_INST,&&MOV_INST,&&STM_INST,&&LDM_INST,&&LDRSH_INST,&&STM_INST,&&LDM_INST,&&LDRSB_INST,
         &&STRD_INST,&&LDRH_INST,&&STRH_INST,&&LDRD_INST,&&STRT_INST,&&STRBT_INST,&&LDRBT_INST,&&LDRT_INST,&&MRC_INST,&&MCR_INST,&&MSR_INST,
-        &&LDRB_INST,&&STRB_INST,&&LDR_INST,&&LDRCOND_INST, &&STR_INST,&&CDP_INST,&&STC_INST,&&LDC_INST,&&SWI_INST,&&BBL_INST,&&B_2_THUMB, &&B_COND_THUMB, 
-        &&BL_1_THUMB, &&BL_2_THUMB, &&BLX_1_THUMB, &&DISPATCH,&&INIT_INST_LENGTH,&&END
+        &&LDRB_INST,&&STRB_INST,&&LDR_INST,&&LDRCOND_INST, &&STR_INST,&&CDP_INST,&&STC_INST,&&LDC_INST,&&SWI_INST,&&BBL_INST,&&LDREXD_INST,
+        &&STREXD_INST,&&LDREXH_INST,&&STREXH_INST,&&B_2_THUMB, &&B_COND_THUMB,&&BL_1_THUMB, &&BL_2_THUMB, &&BLX_1_THUMB, &&DISPATCH,
+        &&INIT_INST_LENGTH,&&END
         };
 #endif
     arm_inst * inst_base;
@@ -4432,45 +4425,84 @@ unsigned InterpreterMainLoop(ARMul_State* state) {
 
     LDREX_INST:
     {
-        ldst_inst *inst_cream = (ldst_inst *)inst_base->component;
+        generic_arm_inst* inst_cream = (generic_arm_inst*)inst_base->component;
         if ((inst_base->cond == 0xe) || CondPassed(cpu, inst_base->cond)) {
-            addr = cpu->Reg[BITS(inst_cream->inst, 16, 19)];
+            unsigned int read_addr = RN;
 
-            unsigned int value = Memory::Read32(addr);
-
-            add_exclusive_addr(cpu, addr);
+            add_exclusive_addr(cpu, read_addr);
             cpu->exclusive_state = 1;
 
-            cpu->Reg[BITS(inst_cream->inst, 12, 15)] = value;
-            if (BITS(inst_cream->inst, 12, 15) == 15) {
-                INC_PC(sizeof(ldst_inst));
+            RD = Memory::Read32(read_addr);
+            if (inst_cream->Rd == 15) {
+                INC_PC(sizeof(generic_arm_inst));
                 goto DISPATCH;
             }
         }
         cpu->Reg[15] += GET_INST_SIZE(cpu);
-        INC_PC(sizeof(ldst_inst));
+        INC_PC(sizeof(generic_arm_inst));
         FETCH_INST;
         GOTO_NEXT_INST;
     }
     LDREXB_INST:
     {
-        ldst_inst *inst_cream = (ldst_inst *)inst_base->component;
+        generic_arm_inst* inst_cream = (generic_arm_inst*)inst_base->component;
         if ((inst_base->cond == 0xe) || CondPassed(cpu, inst_base->cond)) {
-            addr = cpu->Reg[BITS(inst_cream->inst, 16, 19)];
+            unsigned int read_addr = RN;
 
-            unsigned int value = Memory::Read8(addr);
-
-            add_exclusive_addr(cpu, addr);
+            add_exclusive_addr(cpu, read_addr);
             cpu->exclusive_state = 1;
 
-            cpu->Reg[BITS(inst_cream->inst, 12, 15)] = value;
-            if (BITS(inst_cream->inst, 12, 15) == 15) {
-                INC_PC(sizeof(ldst_inst));
+            RD = Memory::Read8(read_addr);
+            if (inst_cream->Rd == 15) {
+                INC_PC(sizeof(generic_arm_inst));
                 goto DISPATCH;
             }
         }
         cpu->Reg[15] += GET_INST_SIZE(cpu);
-        INC_PC(sizeof(ldst_inst));
+        INC_PC(sizeof(generic_arm_inst));
+        FETCH_INST;
+        GOTO_NEXT_INST;
+    }
+    LDREXH_INST: // Exclusive halfword load: RD receives a 16-bit read from the address held in RN
+    {
+        generic_arm_inst* inst_cream = (generic_arm_inst*)inst_base->component;
+        if ((inst_base->cond == 0xe) || CondPassed(cpu, inst_base->cond)) { // cond 0xe bypasses the CondPassed call
+            unsigned int read_addr = RN;
+
+            add_exclusive_addr(cpu, read_addr); // presumably what exclusive_detect() in the STREX* handlers checks against
+            cpu->exclusive_state = 1;
+
+            RD = Memory::Read16(read_addr);
+            if (inst_cream->Rd == 15) { // value went into Reg[15]: skip the Reg[15] advance below and re-dispatch
+                INC_PC(sizeof(generic_arm_inst));
+                goto DISPATCH;
+            }
+        }
+        cpu->Reg[15] += GET_INST_SIZE(cpu); // move Reg[15] past this instruction
+        INC_PC(sizeof(generic_arm_inst));
+        FETCH_INST;
+        GOTO_NEXT_INST;
+    }
+    LDREXD_INST: // Exclusive doubleword load: RD and RD2 receive the 32-bit words at [RN] and [RN + 4]
+    {
+        generic_arm_inst* inst_cream = (generic_arm_inst*)inst_base->component;
+        if ((inst_base->cond == 0xe) || CondPassed(cpu, inst_base->cond)) { // cond 0xe bypasses the CondPassed call
+            unsigned int read_addr = RN;
+
+            add_exclusive_addr(cpu, read_addr); // only the first word's address is recorded (see TODO)
+            cpu->exclusive_state = 1;
+            // TODO(bunnei): Do we need to also make [read_addr + 4] exclusive?
+
+            RD = Memory::Read32(read_addr);
+            RD2 = Memory::Read32(read_addr + 4); // NOTE(review): RD2 is Reg[Rd + 1]; with Rd == 15 (tested below) that is index 16 - confirm Reg has room or decode rejects it
+
+            if (inst_cream->Rd == 15) { // value went into Reg[15]: skip the Reg[15] advance below and re-dispatch
+                INC_PC(sizeof(generic_arm_inst));
+                goto DISPATCH;
+            }
+        }
+        cpu->Reg[15] += GET_INST_SIZE(cpu); // move Reg[15] past this instruction
+        INC_PC(sizeof(generic_arm_inst));
         FETCH_INST;
         GOTO_NEXT_INST;
     }
@@ -5762,46 +5794,96 @@ unsigned InterpreterMainLoop(ARMul_State* state) {
     }
     STREX_INST:
     {
-        ldst_inst *inst_cream = (ldst_inst *)inst_base->component;
-        if ((inst_base->cond == 0xe) || CondPassed(cpu, inst_base->cond)) {
-            addr = cpu->Reg[BITS(inst_cream->inst, 16, 19)];
-            unsigned int value = cpu->Reg[BITS(inst_cream->inst, 0, 3)];
+        generic_arm_inst* inst_cream = (generic_arm_inst*)inst_base->component;
 
-            int dest_reg = BITS(inst_cream->inst, 12, 15);
-            if((exclusive_detect(cpu, addr) == 0) && (cpu->exclusive_state == 1)){
-                remove_exclusive(cpu, addr);
-                cpu->Reg[dest_reg] = 0;
+        if ((inst_base->cond == 0xe) || CondPassed(cpu, inst_base->cond)) {
+            unsigned int write_addr = cpu->Reg[inst_cream->Rn];
+
+            if ((exclusive_detect(cpu, write_addr) == 0) && (cpu->exclusive_state == 1)) {
+                remove_exclusive(cpu, write_addr);
                 cpu->exclusive_state = 0;
 
-                Memory::Write32(addr, value);
+                Memory::Write32(write_addr, cpu->Reg[inst_cream->Rm]);
+                RD = 0;
             } else {
                 // Failed to write due to mutex access
-                cpu->Reg[dest_reg] = 1;
+                RD = 1;
             }
         }
         cpu->Reg[15] += GET_INST_SIZE(cpu);
-        INC_PC(sizeof(ldst_inst));
+        INC_PC(sizeof(generic_arm_inst));
         FETCH_INST;
         GOTO_NEXT_INST;
     }
     STREXB_INST:
     {
-        ldst_inst *inst_cream = (ldst_inst *)inst_base->component;
+        generic_arm_inst* inst_cream = (generic_arm_inst*)inst_base->component;
+
         if ((inst_base->cond == 0xe) || CondPassed(cpu, inst_base->cond)) {
-            addr = cpu->Reg[BITS(inst_cream->inst, 16, 19)];
-            unsigned int value = cpu->Reg[BITS(inst_cream->inst, 0, 3)] & 0xff;
-            int dest_reg = BITS(inst_cream->inst, 12, 15);
-            if((exclusive_detect(cpu, addr) == 0) && (cpu->exclusive_state == 1)){
-                remove_exclusive(cpu, addr);
-                cpu->Reg[dest_reg] = 0;
+            unsigned int write_addr = cpu->Reg[inst_cream->Rn];
+
+            if ((exclusive_detect(cpu, write_addr) == 0) && (cpu->exclusive_state == 1)) {
+                remove_exclusive(cpu, write_addr);
                 cpu->exclusive_state = 0;
-                Memory::Write8(addr, value);
+
+                Memory::Write8(write_addr, cpu->Reg[inst_cream->Rm]);
+                RD = 0;
             } else {
-                cpu->Reg[dest_reg] = 1;
+                // Failed to write due to mutex access
+                RD = 1;
             }
         }
         cpu->Reg[15] += GET_INST_SIZE(cpu);
-        INC_PC(sizeof(ldst_inst));
+        INC_PC(sizeof(generic_arm_inst));
+        FETCH_INST;
+        GOTO_NEXT_INST;
+    }
+    STREXD_INST:
+    {
+        // Exclusive doubleword store: writes Reg[Rm] and Reg[Rm + 1] to [Rn] and [Rn + 4]; RD is 0 on success, 1 on failure.
+        generic_arm_inst* inst_cream = (generic_arm_inst*)inst_base->component;
+
+        if ((inst_base->cond == 0xe) || CondPassed(cpu, inst_base->cond)) {
+            unsigned int write_addr = cpu->Reg[inst_cream->Rn];
+
+            if ((exclusive_detect(cpu, write_addr) == 0) && (cpu->exclusive_state == 1)) {
+                remove_exclusive(cpu, write_addr);
+                cpu->exclusive_state = 0;
+                // TODO(bunnei): Remove exclusive from [write_addr + 4] if we implement this in LDREXD
+
+                Memory::Write32(write_addr, cpu->Reg[inst_cream->Rm]);
+                Memory::Write32(write_addr + 4, cpu->Reg[inst_cream->Rm + 1]); // NOTE(review): Rm + 1 is not range-checked - confirm Rm == 15 cannot reach here
+                RD = 0;
+            } else {
+                // Failed to write due to mutex access
+                RD = 1;
+            }
+        }
+        cpu->Reg[15] += GET_INST_SIZE(cpu); // move Reg[15] past this instruction
+        INC_PC(sizeof(generic_arm_inst));
+        FETCH_INST;
+        GOTO_NEXT_INST;
+    }
+    STREXH_INST: // Exclusive halfword store: writes Reg[Rm] via Write16 to the address in Reg[Rn]; RD is 0 on success, 1 on failure
+    {
+        generic_arm_inst* inst_cream = (generic_arm_inst*)inst_base->component;
+
+        if ((inst_base->cond == 0xe) || CondPassed(cpu, inst_base->cond)) { // cond 0xe bypasses the CondPassed call
+            unsigned int write_addr = cpu->Reg[inst_cream->Rn];
+
+            if ((exclusive_detect(cpu, write_addr) == 0) && (cpu->exclusive_state == 1)) { // store only if both checks pass
+                remove_exclusive(cpu, write_addr);
+                cpu->exclusive_state = 0;
+
+                Memory::Write16(write_addr, cpu->Reg[inst_cream->Rm]);
+                RD = 0;
+            } else {
+                // Failed to write due to mutex access
+                RD = 1;
+            }
+        }
+        cpu->Reg[15] += GET_INST_SIZE(cpu); // move Reg[15] past this instruction
+        INC_PC(sizeof(generic_arm_inst));
         FETCH_INST;
         GOTO_NEXT_INST;
     }