hadoop-streaming
https://www.michael-noll.com/tutorials/writing-an-hadoop-mapreduce-program-in-python/
word mapper
hadoop mapreduce mapper reducer
mapper.py
reducer.py
#!/usr/bin/env python
"""Hadoop Streaming mapper: emit "key<TAB>1" for every key found on a line.

Each input line is treated as comma-separated; only the LAST field is used.
That field is a semicolon-separated list of keys.  Every key is lower-cased,
stripped of surrounding whitespace and written as "key<TAB>1".
"""
import sys


def emit_pairs(lines):
    """Yield ``(key, 1)`` for each non-empty key in each line of *lines*.

    lines: any iterable of text lines (e.g. ``sys.stdin``).
    """
    for line in lines:
        # Only the last comma-separated field carries the keys.
        last_field = line.strip().split(",")[-1]
        for key in last_field.split(";"):
            key = key.lower().strip()
            # Blank lines and stray ';' produce empty keys; emitting them
            # would write a line that starts with a TAB and has no usable key.
            if key:
                yield key, 1


def main(stdin=None, stdout=None):
    """Run the mapper, reading from *stdin* and writing to *stdout*.

    Both streams default to ``sys.stdin`` / ``sys.stdout`` so the script
    behaves as a normal streaming mapper when run from the command line.
    """
    if stdin is None:
        stdin = sys.stdin
    if stdout is None:
        stdout = sys.stdout
    for key, value in emit_pairs(stdin):
        stdout.write("%s\t%d\n" % (key, value))


if __name__ == "__main__":
    main()
#!/usr/bin/env python
"""Hadoop Streaming reducer: sum the counts of consecutive identical keys.

Reads "key<TAB>count" lines and writes one "total<TAB>key" line per run of
equal keys.  Only consecutive lines with the same key are merged, so the
input is expected to arrive grouped by key (as the streaming sort phase
delivers it).

Based on:
https://www.michael-noll.com/tutorials/writing-an-hadoop-mapreduce-program-in-python
"""
import sys


def sum_counts(lines):
    """Yield ``(key, total)`` for each run of consecutive equal keys.

    lines: any iterable of "key<TAB>count" text lines (e.g. ``sys.stdin``).
    Lines without a TAB, or whose count is not an integer, are skipped.
    """
    last_key = None
    running_total = 0
    for input_line in lines:
        try:
            this_key, value = input_line.strip().split("\t", 1)
            value = int(value)
        except ValueError:
            # Malformed line (no TAB or non-integer count): best-effort skip,
            # leaving the current group untouched.
            continue
        if this_key == last_key:
            running_total += value
        else:
            if last_key is not None:
                yield last_key, running_total
            last_key = this_key
            running_total = value
    # Flush the final group.  Testing last_key (not the loop variable) keeps
    # this safe on empty input and when the last line was malformed.
    if last_key is not None:
        yield last_key, running_total


def main(stdin=None, stdout=None):
    """Run the reducer, reading from *stdin* and writing to *stdout*.

    Output lines are "total<TAB>key" (count first, then key).  Both streams
    default to ``sys.stdin`` / ``sys.stdout``.
    """
    if stdin is None:
        stdin = sys.stdin
    if stdout is None:
        stdout = sys.stdout
    for key, total in sum_counts(stdin):
        stdout.write("%d\t%s\n" % (total, key))


if __name__ == "__main__":
    main()
Reviews
There are no reviews yet.