/*
 * eMMC BIS driver for Nintendo Switch
 *
 * Copyright (c) 2019 shchmue
 * Copyright (c) 2019-2020 CTCaer
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms and conditions of the GNU General Public License,
 * version 2, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
 * more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */

#include <string.h>

#include <memory_map.h>

#include <mem/heap.h>
#include <sec/se.h>
#include "../storage/nx_emmc.h"
#include "nx_emmc_bis.h"
#include <storage/sdmmc.h>
#include <utils/types.h>

// Number of slots in the cluster cache; the fill index rolls over to 0 once it reaches this.
#define MAX_CLUSTER_CACHE_ENTRIES 32768
// Value of a cluster_lookup[] entry whose cluster has no cache slot (the table is cleared with memset -1).
#define CLUSTER_LOOKUP_EMPTY_ENTRY 0xFFFFFFFF
// Number of eMMC sectors per cluster; sector numbers are split into cluster / index-in-cluster with this.
#define SECTORS_PER_CLUSTER 0x20

/*
 * One slot of the cluster cache: bookkeeping fields followed by the
 * decrypted contents of a single cluster.
 */
typedef struct
{
	u32 cluster_num;                // index of the cluster in the partition
	u32 visit_count;                // used for debugging/access analysis
	u8  dirty;                      // has been modified without writeback flag
	u8  align[7];                   // unused padding; the fields above plus this total 16 bytes
	u8  cluster[XTS_CLUSTER_SIZE];  // the cached cluster itself
} cluster_cache_t;

/*
 * Layout of the cache memory region: one cluster-sized staging buffer used
 * for raw eMMC transfers and as crypto input/output, followed by the array of
 * cache slots (indexed up to MAX_CLUSTER_CACHE_ENTRIES by the code below).
 */
typedef struct
{
	u8 emmc_buffer[XTS_CLUSTER_SIZE];
	cluster_cache_t cluster_cache[];
} bis_cache_t;

// Keyslot numbers handed to the SE for the data pass and the tweak pass; set by nx_emmc_bis_init().
static u8 ks_crypt = 0;
static u8 ks_tweak = 0;
// Set to 1 once the cache fill index has rolled over, i.e. every slot has been used at least once.
static u8 cache_filled = 0;
// Number of cache slots currently marked dirty.
static u32 dirty_cluster_count = 0;
// Index of the next cache slot to fill.
static u32 cluster_cache_end_index = 0;
// Partition all reads/writes go to; NULL until nx_emmc_bis_init() is called.
static emmc_part_t *system_part = NULL;
// Cache region, located at the fixed address NX_BIS_CACHE_ADDR.
static bis_cache_t *bis_cache = (bis_cache_t *)NX_BIS_CACHE_ADDR;
// Heap allocation backing the lookup table when it cannot live at NX_BIS_LOOKUP_ADR; NULL otherwise.
static u32 *cluster_lookup_buf = NULL;
// Lookup table: cluster number -> cache slot index, or CLUSTER_LOOKUP_EMPTY_ENTRY.
static u32 *cluster_lookup = NULL;
// When true, reads of uncached clusters bypass the cache instead of filling a slot.
static bool lock_cluster_cache = false;

static void _gf256_mul_x_le(void *block)
{
	u32 *pdata = (u32 *)block;
	u32 carry = 0;

	for (u32 i = 0; i < 4; i++)
	{
		u32 b = pdata[i];
		pdata[i] = (b << 1) | carry;
		carry = b >> 31;
	}

	if (carry)
		pdata[0x0] ^= 0x87;
}

/*
 * XTS-style encrypt/decrypt of sec_size bytes from src into dst.
 *
 * tweak_ks     keyslot used to encrypt the freshly generated tweak.
 * crypt_ks     keyslot used for the data ECB pass.
 * enc          passed through to se_aes_crypt_ecb() (callers use 1 to
 *              encrypt, 0 to decrypt).
 * tweak        16-byte, 4-byte aligned tweak buffer, updated in place. On
 *              success it is left advanced past the last processed block,
 *              so a caller can keep it and continue with regen_tweak false.
 * regen_tweak  when true, tweak is rebuilt from sec (written big-endian
 *              into the 16 bytes) and run through se_aes_crypt_block_ecb().
 * tweak_exp    number of 32-block steps to advance the tweak before use
 *              (callers pass a sector offset).
 * sec          number the tweak is generated from (callers pass the
 *              cluster number).
 * dst, src     output and input buffers; they may be the same buffer.
 * sec_size     byte count; assumed to be a multiple of 0x10.
 *
 * Returns 1 on success, 0 if either SE operation reports failure.
 */
static int _nx_aes_xts_crypt_sec(u32 tweak_ks, u32 crypt_ks, u32 enc, u8 *tweak, bool regen_tweak, u32 tweak_exp, u32 sec, void *dst, const void *src, u32 sec_size)
{
	const u32 block_cnt = sec_size >> 4;

	if (regen_tweak)
	{
		// sec occupies the last four bytes, most significant byte first; the rest is zero.
		memset(tweak, 0, 0xC);
		tweak[0xC] = (sec >> 24) & 0xFF;
		tweak[0xD] = (sec >> 16) & 0xFF;
		tweak[0xE] = (sec >> 8) & 0xFF;
		tweak[0xF] = sec & 0xFF;

		if (!se_aes_crypt_block_ecb(tweak_ks, 1, tweak, tweak))
			return 0;
	}

	// Advance the tweak by tweak_exp * 32 blocks; lets a caller resume from a saved tweak.
	u32 skip = tweak_exp << 5;
	while (skip--)
		_gf256_mul_x_le(tweak);

	// Keep the starting tweak: the post-crypt pass must replay the same sequence.
	u8 start_tweak[0x10] __attribute__((aligned(4)));
	memcpy(start_tweak, tweak, sizeof(start_tweak));

	// Pre-whitening: dst = src ^ tweak, one 16-byte block at a time.
	const u32 *in = (const u32 *)src;
	u32 *out = (u32 *)dst;
	const u32 *tw = (const u32 *)tweak;
	for (u32 blk = 0; blk < block_cnt; blk++, in += 4, out += 4)
	{
		out[0] = in[0] ^ tw[0];
		out[1] = in[1] ^ tw[1];
		out[2] = in[2] ^ tw[2];
		out[3] = in[3] ^ tw[3];

		_gf256_mul_x_le(tweak);
	}

	if (!se_aes_crypt_ecb(crypt_ks, enc, dst, sec_size, dst, sec_size))
		return 0;

	// Post-whitening with the same tweak sequence, replayed from the saved copy.
	out = (u32 *)dst;
	tw = (const u32 *)start_tweak;
	for (u32 blk = 0; blk < block_cnt; blk++, out += 4)
	{
		out[0] ^= tw[0];
		out[1] ^= tw[1];
		out[2] ^= tw[2];
		out[3] ^= tw[3];

		_gf256_mul_x_le(start_tweak);
	}

	return 1;
}

/*
 * Writes count sectors starting at sector; the request must lie inside a
 * single cluster.
 *
 * If the cluster is cached, buff (when non-NULL) is copied into the cached
 * copy and the slot is marked dirty. Nothing reaches the eMMC unless
 * force_flush is set, in which case the whole cached cluster is encrypted
 * and written out and the slot is marked clean again.
 * If the cluster is not cached, buff is encrypted and written directly.
 *
 * buff may be NULL only to flush an already cached cluster.
 *
 * Returns 0 on success, 1 on R/W error or invalid request, 3 if no
 * partition has been set up.
 */
static int nx_emmc_bis_write_block(u32 sector, u32 count, void *buff, bool force_flush)
{
	if (!system_part)
		return 3; // Not ready.

	u8 tweak[0x10] __attribute__((aligned(4)));
	u32 cluster = sector / SECTORS_PER_CLUSTER;
	u32 aligned_sector = cluster * SECTORS_PER_CLUSTER;
	u32 sector_index_in_cluster = sector % SECTORS_PER_CLUSTER;

	// A request running past the end of the cluster would overflow the cached
	// cluster copy, and the tweak sequence is only valid within one cluster.
	if (count > SECTORS_PER_CLUSTER - sector_index_in_cluster)
		return 1; // R/W error.

	u32 cluster_lookup_index = cluster_lookup[cluster];
	bool is_cached = cluster_lookup_index != CLUSTER_LOOKUP_EMPTY_ENTRY;

	// Write to cached cluster.
	if (is_cached)
	{
		cluster_cache_t *entry = &bis_cache->cluster_cache[cluster_lookup_index];

		if (buff)
			memcpy(entry->cluster + sector_index_in_cluster * NX_EMMC_BLOCKSIZE, buff, count * NX_EMMC_BLOCKSIZE);
		entry->visit_count++;
		if (entry->dirty == 0)
			dirty_cluster_count++;
		entry->dirty = 1;
		if (!force_flush)
			return 0; // Success.

		// Reset args to trigger a full cluster flush to emmc. The source must
		// be the cached cluster: the caller's buffer only holds count sectors.
		buff = entry->cluster;
		sector_index_in_cluster = 0;
		sector = aligned_sector;
		count = SECTORS_PER_CLUSTER;
	}
	else if (!buff)
	{
		// Nothing cached to flush and no data supplied.
		return 1; // R/W error.
	}

	// Encrypt and write.
	if (!_nx_aes_xts_crypt_sec(ks_tweak, ks_crypt, 1, tweak, true, sector_index_in_cluster, cluster, bis_cache->emmc_buffer, buff, count * NX_EMMC_BLOCKSIZE) ||
		!nx_emmc_part_write(&emmc_storage, system_part, sector, count, bis_cache->emmc_buffer)
	)
		return 1; // R/W error.

	// Mark cache entry not dirty if write succeeds.
	if (is_cached)
	{
		bis_cache->cluster_cache[cluster_lookup_index].dirty = 0;
		dirty_cluster_count--;
	}

	return 0; // Success.
}

/*
 * Writes the cluster held by cache_entry back to the eMMC by requesting a
 * forced full-cluster write with no caller data. The result of the write is
 * not reported; on success the write routine clears the entry's dirty flag.
 */
static void _nx_emmc_bis_flush_cluster(cluster_cache_t *cache_entry)
{
	u32 first_sector = cache_entry->cluster_num * SECTORS_PER_CLUSTER;

	(void)nx_emmc_bis_write_block(first_sector, SECTORS_PER_CLUSTER, NULL, true);
}

/*
 * Reads count decrypted sectors starting at sector into buff; the request
 * must lie inside a single cluster.
 *
 * Three paths:
 *  - the cluster is cached: copy straight from the cache slot;
 *  - the cluster is not cached and the cache is unlocked: read and decrypt
 *    the whole cluster into the next cache slot (flushing and evicting the
 *    slot's previous cluster after a rollover), then copy from it;
 *  - the cache is locked: read and decrypt only the requested sectors,
 *    reusing the tweak saved by the previous such read when possible.
 *
 * Returns 0 on success, 1 on R/W error or invalid request, 3 if no
 * partition has been set up.
 *
 * NOTE(review): the saved tweak state below is function-static and is not
 * reset by nx_emmc_bis_init(); confirm callers cannot switch partition or
 * keys between two locked-cache reads of the same cluster number.
 */
static int nx_emmc_bis_read_block(u32 sector, u32 count, void *buff)
{
	if (!system_part)
		return 3; // Not ready.

	// Saved state of the last direct (uncached) decrypt: tweak is positioned
	// right after prev_sector of prev_cluster. Only the direct path touches it.
	static u32 prev_cluster = -1;
	static u32 prev_sector = 0;
	static u8 tweak[0x10] __attribute__((aligned(4)));
	u8 cache_tweak[0x10] __attribute__((aligned(4)));

	u32 tweak_exp = 0;
	bool regen_tweak = true;

	u32 cluster = sector / SECTORS_PER_CLUSTER;
	u32 aligned_sector = cluster * SECTORS_PER_CLUSTER;
	u32 sector_index_in_cluster = sector % SECTORS_PER_CLUSTER;

	// A request running past the end of the cluster would read beyond the
	// cached cluster copy, and the tweak sequence is only valid within one cluster.
	if (count > SECTORS_PER_CLUSTER - sector_index_in_cluster)
		return 1; // R/W error.

	u32 cluster_lookup_index = cluster_lookup[cluster];

	// Read from cached cluster.
	if (cluster_lookup_index != CLUSTER_LOOKUP_EMPTY_ENTRY)
	{
		cluster_cache_t *hit = &bis_cache->cluster_cache[cluster_lookup_index];

		memcpy(buff, hit->cluster + sector_index_in_cluster * NX_EMMC_BLOCKSIZE, count * NX_EMMC_BLOCKSIZE);
		hit->visit_count++;
		// prev_cluster/prev_sector are deliberately left alone: the saved
		// tweak was not advanced, so they must keep describing it.
		return 0; // Success.
	}

	// Cache cluster.
	if (!lock_cluster_cache)
	{
		// Roll the cache index over if full.
		if (cluster_cache_end_index >= MAX_CLUSTER_CACHE_ENTRIES)
		{
			cluster_cache_end_index = 0;
			cache_filled = 1;
		}

		cluster_cache_t *slot = &bis_cache->cluster_cache[cluster_cache_end_index];

		// After a rollover the slot still holds an older cluster: evict it.
		if (cache_filled == 1)
		{
			if (slot->dirty == 1)
			{
				_nx_emmc_bis_flush_cluster(slot);
				// A successful flush clears the dirty flag. If it is still
				// set the write failed; do not discard the modified data.
				if (slot->dirty)
					return 1; // R/W error.
			}

			// Drop the old cluster's lookup entry so it no longer resolves to
			// this slot once the slot holds different data.
			if (cluster_lookup[slot->cluster_num] == cluster_cache_end_index)
				cluster_lookup[slot->cluster_num] = CLUSTER_LOOKUP_EMPTY_ENTRY;
		}

		// Read and decrypt the whole cluster the sector resides in.
		if (!nx_emmc_part_read(&emmc_storage, system_part, aligned_sector, SECTORS_PER_CLUSTER, bis_cache->emmc_buffer) ||
			!_nx_aes_xts_crypt_sec(ks_tweak, ks_crypt, 0, cache_tweak, true, 0, cluster, bis_cache->emmc_buffer, bis_cache->emmc_buffer, XTS_CLUSTER_SIZE)
		)
			return 1; // R/W error.

		// Register the slot only now that its contents are valid.
		slot->cluster_num = cluster;
		slot->visit_count = 1;
		slot->dirty = 0;
		memcpy(slot->cluster, bis_cache->emmc_buffer, XTS_CLUSTER_SIZE);
		cluster_lookup[cluster] = cluster_cache_end_index;

		memcpy(buff, bis_cache->emmc_buffer + sector_index_in_cluster * NX_EMMC_BLOCKSIZE, count * NX_EMMC_BLOCKSIZE);
		cluster_cache_end_index++;
		return 0; // Success.
	}

	// If not reading from or writing to cache, do a regular read and decrypt.
	if (!nx_emmc_part_read(&emmc_storage, system_part, sector, count, bis_cache->emmc_buffer))
		return 1; // R/W error.

	if (prev_cluster != cluster) // Sector in different cluster than last read.
	{
		prev_cluster = cluster;
		tweak_exp = sector_index_in_cluster;
	}
	else if (sector > prev_sector) // Sector in same cluster and past last sector.
	{
		// Calculates the new tweak using the saved one, reducing expensive _gf256_mul_x_le calls.
		tweak_exp = sector - prev_sector - 1;
		regen_tweak = false;
	}
	else // Sector in same cluster and before or same as last sector.
		tweak_exp = sector_index_in_cluster;

	// Maximum one cluster (1 XTS crypto block 16KB).
	if (!_nx_aes_xts_crypt_sec(ks_tweak, ks_crypt, 0, tweak, regen_tweak, tweak_exp, cluster, buff, bis_cache->emmc_buffer, count * NX_EMMC_BLOCKSIZE))
	{
		// The saved tweak may have been partially updated; force a
		// regeneration on the next direct read.
		prev_cluster = (u32)-1;
		return 1; // R/W error.
	}
	prev_sector = sector + count - 1;

	return 0; // Success.
}

/*
 * Reads count decrypted sectors starting at sector into buff.
 *
 * The request is split into chunks that never cross a cluster boundary,
 * because the block-level routine works on one cluster at a time. For
 * cluster-aligned requests the chunks are whole clusters.
 *
 * Returns 0 on success and 1 on any failure. A count of 0 also returns 1.
 */
int nx_emmc_bis_read(u32 sector, u32 count, void *buff)
{
	int res = 1;
	u8 *buf = (u8 *)buff;
	u32 curr_sct = sector;

	while (count)
	{
		// Stop each chunk at the end of the cluster it starts in.
		u32 sct_left_in_cluster = SECTORS_PER_CLUSTER - (curr_sct % SECTORS_PER_CLUSTER);
		u32 sct_cnt = MIN(count, sct_left_in_cluster);

		res = nx_emmc_bis_read_block(curr_sct, sct_cnt, buf);
		if (res)
			return 1;

		count -= sct_cnt;
		curr_sct += sct_cnt;
		buf += NX_EMMC_BLOCKSIZE * sct_cnt;
	}

	return res;
}

/*
 * Writes count sectors from buff starting at sector. Data for cached
 * clusters only updates the cache (no forced flush is requested here).
 *
 * The request is split into chunks that never cross a cluster boundary,
 * because the block-level routine works on one cluster at a time. For
 * cluster-aligned requests the chunks are whole clusters.
 *
 * Returns 0 on success and 1 on any failure. A count of 0 also returns 1.
 */
int nx_emmc_bis_write(u32 sector, u32 count, void *buff)
{
	int res = 1;
	u8 *buf = (u8 *)buff;
	u32 curr_sct = sector;

	while (count)
	{
		// Stop each chunk at the end of the cluster it starts in.
		u32 sct_left_in_cluster = SECTORS_PER_CLUSTER - (curr_sct % SECTORS_PER_CLUSTER);
		u32 sct_cnt = MIN(count, sct_left_in_cluster);

		res = nx_emmc_bis_write_block(curr_sct, sct_cnt, buf, false);
		if (res)
			return 1;

		count -= sct_cnt;
		curr_sct += sct_cnt;
		buf += NX_EMMC_BLOCKSIZE * sct_cnt;
	}

	return res;
}

/*
 * (Re)initializes the cluster cache for the current partition: selects the
 * memory for the cluster lookup table, marks every cluster as uncached,
 * resets the fill index and dirty count, and unlocks the cache.
 *
 * Does nothing if no partition has been set. If the lookup table has to be
 * heap-allocated and the allocation fails, the partition pointer is cleared
 * so that subsequent reads and writes fail instead of using a missing table.
 *
 * Dirty clusters are not written back here.
 */
void nx_emmc_bis_cluster_cache_init()
{
	// The table size is derived from the partition, so one must be set.
	if (!system_part)
		return;

	u32 cluster_lookup_size = (system_part->lba_end - system_part->lba_start + 1) / SECTORS_PER_CLUSTER * sizeof(*cluster_lookup);

	if (cluster_lookup_buf)
		free(cluster_lookup_buf);
	cluster_lookup_buf = NULL;

	// Check if carveout protected, in case of old hwinit (pre 4.0.0) chainload.
	// If the zero written here does not read back, the fixed region is
	// treated as unusable and the table goes on the heap instead.
	*(vu32 *)NX_BIS_LOOKUP_ADR = 0;
	if (*(vu32 *)NX_BIS_LOOKUP_ADR != 0)
	{
		// Extra 0x2000 bytes leave room for aligning the table to 0x1000.
		cluster_lookup_buf = (u32 *)malloc(cluster_lookup_size + 0x2000);
		if (!cluster_lookup_buf)
		{
			// No lookup table available: leave the driver in the not-ready state.
			cluster_lookup = NULL;
			system_part = NULL;
			return;
		}
		cluster_lookup = (u32 *)ALIGN((u32)cluster_lookup_buf, 0x1000);
	}
	else
	{
		cluster_lookup = (u32 *)NX_BIS_LOOKUP_ADR;
	}

	// Clear cluster lookup table and reset end index.
	memset(cluster_lookup, -1, cluster_lookup_size);
	cluster_cache_end_index = 0;
	lock_cluster_cache = false;

	dirty_cluster_count = 0;
	cache_filled = 0;
}

/*
 * Selects part as the partition to operate on, resets the cluster cache and
 * picks the keyslot pair for the partition index:
 *   0, 1  (PRODINFO, PRODINFOF) -> crypt 0, tweak 1
 *   8     (SAFE)                -> crypt 2, tweak 3
 *   9, 10 (SYSTEM, USER)        -> crypt 4, tweak 5
 * For any other index the keyslots keep their previous values.
 */
void nx_emmc_bis_init(emmc_part_t *part)
{
	u8 ks_base;

	system_part = part;

	nx_emmc_bis_cluster_cache_init();

	// Each partition group uses two consecutive keyslots: crypt, then tweak.
	switch (part->index)
	{
	case 0:  // PRODINFO.
	case 1:  // PRODINFOF.
		ks_base = 0;
		break;
	case 8:  // SAFE.
		ks_base = 2;
		break;
	case 9:  // SYSTEM.
	case 10: // USER.
		ks_base = 4;
		break;
	default: // Unknown partition: leave keyslots untouched.
		return;
	}

	ks_crypt = ks_base;
	ks_tweak = ks_base + 1;
}

void nx_emmc_bis_finalize()
{
	if (dirty_cluster_count == 0)
		return;

	u32 limit = cache_filled == 1 ? MAX_CLUSTER_CACHE_ENTRIES : cluster_cache_end_index;
	u32 clusters_to_flush = dirty_cluster_count;
	for (u32 i = 0; i < limit && clusters_to_flush; i++)
	{
		if (bis_cache->cluster_cache[i].dirty) {
			_nx_emmc_bis_flush_cluster(&bis_cache->cluster_cache[i]);
			clusters_to_flush--;
		}
	}
}

// Set cluster cache lock according to arg.
// While locked, reads of clusters that are not already cached are served by a
// direct read and decrypt instead of filling a new cache slot; clusters that
// are already cached are still served from (and written to) the cache.
void nx_emmc_bis_cache_lock(bool lock)
{
	lock_cluster_cache = lock;
}