mirror of
https://github.com/reddit-archive/reddit.git
synced 2026-01-14 17:38:04 -05:00
Thanks to Nathanael A. Hoyle for the report! Some of these may have been exploitable due to pointer arithmetic before reads / writes. Just bail out if we can't allocate.
140 lines
3.9 KiB
C
140 lines
3.9 KiB
C
#include <assert.h>
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <arpa/inet.h>

#include <pcre.h>
#include <zlib.h>
|
|
|
|
/* size in bytes of the line buffer handed to fgets(); one read therefore
 * consumes at most MAX_LINE - 1 bytes of input */
#define MAX_LINE 2048
|
|
|
|
/******************************
 * this regular expression has the following capture groups in it (in order):
 *  - ip          (last address of an optional comma-separated address list)
 *  - path        (request path of a GET request, up to the '?')
 *  - query       (everything after the '?' up to the next whitespace)
 *  - code        (the whitespace-delimited token right after the quoted
 *                 request; presumably the HTTP response code)
 *  - user agent  (contents of the last quoted field matched)
 *
 * only GET requests that carry a query string can match.
 ******************************/
#define RE "(?:[0-9.]+,\\ )*([0-9.]+)"\
           "[^\"]+"\
           "\"GET\\s([^\\s?]+)\\?([^\\s]+)\\s[^\"]+\""\
           "\\s([^\\s]+)[^\"]+"\
           "\"[^\"]+\""\
           "[^\"]+"\
           "\"([^\"]+)\""

/* 1-based indices of the capture groups above, in pattern order
 * (index 0 is the whole match) */
enum {
    GROUP_IP = 1,
    GROUP_PATH = 2,
    GROUP_QUERY = 3,
    GROUP_CODE = 4,
    GROUP_UA = 5
};
|
|
|
|
int main(int argc, char** argv)
|
|
{
|
|
/* compile the pattern */
|
|
const char* error;
|
|
int error_offset;
|
|
pcre *re = pcre_compile(RE, 0, &error, &error_offset, NULL);
|
|
if (re == NULL) {
|
|
fprintf(
|
|
stderr,
|
|
"character %d: failed to compile regex: %s\n",
|
|
error_offset,
|
|
error
|
|
);
|
|
|
|
return 1;
|
|
}
|
|
|
|
/* study it to speed it up */
|
|
pcre_extra *extra = pcre_study(re, 0, &error);
|
|
|
|
/* allocate enough space for the capturing groups */
|
|
int group_count;
|
|
pcre_fullinfo(re, extra, PCRE_INFO_CAPTURECOUNT, &group_count);
|
|
int match_vector_size = (group_count + 1) * 3;
|
|
int *matches = malloc(sizeof(int) * match_vector_size);
|
|
if (matches == NULL) {
|
|
fprintf(stderr, "Couldn't allocate memory for regex groups!\n");
|
|
return 1;
|
|
}
|
|
|
|
/* iterate through the input */
|
|
char input_line[MAX_LINE];
|
|
while (fgets(input_line, MAX_LINE, stdin)) {
|
|
int length = strlen(input_line);
|
|
|
|
/* run the regular expression against the line */
|
|
int match_result = pcre_exec(
|
|
re,
|
|
extra,
|
|
input_line,
|
|
length,
|
|
0,
|
|
0,
|
|
matches,
|
|
match_vector_size
|
|
);
|
|
|
|
/* bail out if the line didn't match */
|
|
if (match_result < 0) {
|
|
continue;
|
|
}
|
|
|
|
/* iterate through the groups */
|
|
/* NOTE: the crc function uses int32_t instead of uint32_t
|
|
* and has the funky (2^31 - crc) bit of math for backwards
|
|
* compatibility with the old python code. fix this when
|
|
* such compatibility is no longer necessary. */
|
|
uint32_t address = 0;
|
|
int32_t crc;
|
|
uint64_t unique_id;
|
|
|
|
for (int i = 1; i < match_result; i++) {
|
|
int start_position = matches[i * 2];
|
|
int end_position = matches[i * 2 + 1];
|
|
int substr_length = end_position - start_position;
|
|
|
|
switch (i) {
|
|
case GROUP_UA:
|
|
crc = crc32(0L, Z_NULL, 0);
|
|
crc = crc32(crc, (unsigned char*)input_line + start_position,
|
|
substr_length);
|
|
unique_id = (((uint64_t)address << 32) & 0xffffffff00000000) |
|
|
(2147483648 - crc);
|
|
fprintf(stdout, "%" PRIu64, unique_id);
|
|
break;
|
|
case GROUP_IP:
|
|
/* parse and store the ip so we can use it in GROUP_UA
|
|
* to calculate the unique id */
|
|
input_line[end_position] = 0;
|
|
address = inet_addr(input_line + start_position);
|
|
|
|
/* fall through so it gets written out as well */
|
|
case GROUP_PATH:
|
|
case GROUP_CODE:
|
|
case GROUP_QUERY:
|
|
/* write them out verbatim */
|
|
(void)fwrite(
|
|
input_line + start_position,
|
|
sizeof(char),
|
|
substr_length,
|
|
stdout
|
|
);
|
|
break;
|
|
}
|
|
|
|
/* tab-delimit the data */
|
|
fputc('\t', stdout);
|
|
}
|
|
|
|
fputc('\n', stdout);
|
|
}
|
|
|
|
return 0;
|
|
}
|
|
|