diff --git a/src/citra/citra.cpp b/src/citra/citra.cpp
index 9399ff296..7dc721dc3 100644
--- a/src/citra/citra.cpp
+++ b/src/citra/citra.cpp
@@ -31,7 +31,9 @@ int __cdecl main(int argc, char **argv) {
         return -1;
     }
 
-    Core::RunLoop();
+    while(true) {
+        Core::RunLoop();
+    }
 
     delete emu_window;
 
diff --git a/src/core/core.cpp b/src/core/core.cpp
index fc9909377..f21801e52 100644
--- a/src/core/core.cpp
+++ b/src/core/core.cpp
@@ -6,6 +6,8 @@
 #include "common/log.h"
 #include "common/symbols.h"
 
+#include "video_core/video_core.h"
+
 #include "core/core.h"
 #include "core/mem_map.h"
 #include "core/hw/hw.h"
@@ -24,29 +26,17 @@ ARM_Interface*  g_app_core      = nullptr;  ///< ARM11 application core
 ARM_Interface*  g_sys_core      = nullptr;  ///< ARM11 system (OS) core
 
 /// Run the core CPU loop
-void RunLoop() {
-    for (;;){
-        // This function loops for 100 instructions in the CPU before trying to update hardware.
-        // This is a little bit faster than SingleStep, and should be pretty much equivalent. The 
-        // number of instructions chosen is fairly arbitrary, however a large number will more 
-        // drastically affect the frequency of GSP interrupts and likely break things. The point of
-        // this is to just loop in the CPU for more than 1 instruction to reduce overhead and make
-        // it a little bit faster...
-        g_app_core->Run(100);
-        HW::Update();
-        if (HLE::g_reschedule) {
-            Kernel::Reschedule();
-        }
+void RunLoop(int tight_loop) {
+    g_app_core->Run(tight_loop);
+    HW::Update();
+    if (HLE::g_reschedule) {
+        Kernel::Reschedule();
     }
 }
 
 /// Step the CPU one instruction
 void SingleStep() {
-    g_app_core->Step();
-    HW::Update();
-    if (HLE::g_reschedule) {
-        Kernel::Reschedule();
-    }
+    RunLoop(1);
 }
 
 /// Halt the core
diff --git a/src/core/core.h b/src/core/core.h
index 4b42dabcb..9c72c8b3f 100644
--- a/src/core/core.h
+++ b/src/core/core.h
@@ -19,8 +19,15 @@ extern ARM_Interface*   g_sys_core;     ///< ARM11 system (OS) core
 /// Start the core
 void Start();
 
-/// Run the core CPU loop
-void RunLoop();
+/**
+ * Run the core CPU loop
+ * This function runs tight_loop instructions (100 by default) in the CPU before updating
+ * hardware. This is a little bit faster than SingleStep, and should be pretty much equivalent. The
+ * default count is fairly arbitrary; however, a large number will more drastically affect the
+ * frequency of GSP interrupts and likely break things. The point of this is to just loop in the CPU
+ * for more than 1 instruction to reduce overhead and make it a little bit faster...
+ */
+void RunLoop(int tight_loop=100);
 
 /// Step the CPU one instruction
 void SingleStep();
diff --git a/src/core/hw/gpu.cpp b/src/core/hw/gpu.cpp
index f1f3e7ab3..8709b8eb7 100644
--- a/src/core/hw/gpu.cpp
+++ b/src/core/hw/gpu.cpp
@@ -24,6 +24,7 @@ Regs g_regs;
 
 u32 g_cur_line = 0;         ///< Current vertical screen line
 u64 g_last_line_ticks = 0;  ///< CPU tick count from last vertical screen line
+u64 g_last_frame_ticks = 0; ///< CPU tick count from last frame
 
 template <typename T>
 inline void Read(T &var, const u32 raw_addr) {
@@ -179,27 +180,44 @@ void Update() {
     auto& framebuffer_top = g_regs.framebuffer_config[0];
     u64 current_ticks = Core::g_app_core->GetTicks();
 
-    // Synchronize line...
-    if ((current_ticks - g_last_line_ticks) >= GPU::kFrameTicks / framebuffer_top.height) {
-        GSP_GPU::SignalInterrupt(GSP_GPU::InterruptId::PDC0);
-        g_cur_line++;
-        g_last_line_ticks = current_ticks;
+    // Update the frame after a certain number of CPU ticks have elapsed. This assumes that the
+    // active frame in memory is always complete to render. There also may be issues with this
+    // becoming out-of-sync with GSP synchronization code (as follows). At this time, this seems to
+    // be the most effective solution for both homebrew and retail applications. With retail, this
+    // could be moved below (and probably would guarantee more accurate synchronization). However,
+    // primitive homebrew relies on a vertical blank interrupt to happen inevitably (regardless of a
+    // threading reschedule).
+
+    if ((current_ticks - g_last_frame_ticks) > GPU::kFrameTicks) {
+        VideoCore::g_renderer->SwapBuffers();
+        g_last_frame_ticks = current_ticks;
     }
 
-    // Synchronize frame...
-    if (g_cur_line >= framebuffer_top.height) {
-        g_cur_line = 0;
-        GSP_GPU::SignalInterrupt(GSP_GPU::InterruptId::PDC1);
-        VideoCore::g_renderer->SwapBuffers();
-        Kernel::WaitCurrentThread(WAITTYPE_VBLANK);
-        HLE::Reschedule(__func__);
+    // Synchronize GPU on a thread reschedule: Because we cannot accurately predict a vertical
+    // blank, we need to simulate it. Based on testing, it seems that retail applications work more
+    // accurately when this is signalled between thread switches.
+
+    if (HLE::g_reschedule) {
+
+        // Synchronize line...
+        if ((current_ticks - g_last_line_ticks) >= GPU::kFrameTicks / framebuffer_top.height) {
+            GSP_GPU::SignalInterrupt(GSP_GPU::InterruptId::PDC0);
+            g_cur_line++;
+            g_last_line_ticks = current_ticks;
+        }
+
+        // Synchronize frame...
+        if (g_cur_line >= framebuffer_top.height) {
+            g_cur_line = 0;
+            GSP_GPU::SignalInterrupt(GSP_GPU::InterruptId::PDC1);
+        }
     }
 }
 
 /// Initialize hardware
 void Init() {
     g_cur_line = 0;
-    g_last_line_ticks = Core::g_app_core->GetTicks();
+    g_last_frame_ticks = g_last_line_ticks = Core::g_app_core->GetTicks();
 
     auto& framebuffer_top = g_regs.framebuffer_config[0];
     auto& framebuffer_sub = g_regs.framebuffer_config[1];