
5

sudo systemctl start mongod
mongosh

use testDB

db.students.insertMany([
  { _id: 1, name: "Alice", age: 22, marks: 85 },
  { _id: 2, name: "Bob", age: 21, marks: 78 },
  { _id: 3, name: "Charlie", age: 23, marks: 92 },
  { _id: 4, name: "David", age: 20, marks: 88 },
  { _id: 5, name: "Eve", age: 22, marks: 76 }
])

db.students.countDocuments({})
db.students.find().sort({ marks: -1 })
db.students.find().sort({ marks: -1 }).limit(3)
db.students.find().sort({ marks: -1 }).skip(2)
db.students.aggregate([
  { $group: { _id: "$age", avgMarks: { $avg: "$marks" } } }
])
db.students.find().pretty()
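The same aggregation can be driven from Python. Below is a minimal sketch using pymongo, assuming a local mongod on the default port and the testDB database populated as above; the collection and field names come from the shell session, everything else is illustrative.

# Minimal pymongo sketch: average marks per age, mirroring the mongosh aggregate.
# Assumes mongod is running locally on the default port 27017.
from pymongo import MongoClient

client = MongoClient("mongodb://localhost:27017/")
db = client["testDB"]

# Same $group stage as in mongosh: one output document per distinct age.
pipeline = [{"$group": {"_id": "$age", "avgMarks": {"$avg": "$marks"}}}]
for doc in db.students.aggregate(pipeline):
    print(doc["_id"], doc["avgMarks"])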

8

nano wordcount_sim.py

from collections import defaultdict

text = """
Hadoop MapReduce is a software framework
for easily writing applications which process vast amounts
"""

def mapper(line):
    words = line.strip().split()
    return [(word, 1) for word in words]

# Map phase
mapped = []
for line in text.strip().split('\n'):
    mapped.extend(mapper(line))

# Shuffle and sort phase
shuffle_sort = defaultdict(list)
for word, count in mapped:
    shuffle_sort[word].append(count)

# Reduce phase
def reducer(shuffled_data):
    reduced = {}
    for word, counts in shuffled_data.items():
        reduced[word] = sum(counts)
    return reduced

word_counts = reducer(shuffle_sort)

# Output
for word, count in word_counts.items():
    print(f"{word}\t{count}")

Save with Ctrl+O, Enter, then exit with Ctrl+X.

python3 wordcount_sim.py   (or) python wordcount_sim.py
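This in-memory simulation maps one-to-one onto Hadoop Streaming, where the map and reduce phases run as separate scripts connected by stdin/stdout and the framework sorts between them. A minimal sketch under that convention follows; the file names mapper.py and reducer.py are illustrative:

# mapper.py (illustrative name): emit one "word<TAB>1" line per word on stdin.
import sys

for line in sys.stdin:
    for word in line.strip().split():
        print(f"{word}\t1")

# reducer.py (illustrative name): sum counts per word; assumes input
# arrives sorted by key, which the Hadoop shuffle guarantees.
import sys

current, total = None, 0
for line in sys.stdin:
    word, count = line.rstrip("\n").split("\t", 1)
    if word != current:
        if current is not None:
            print(f"{current}\t{total}")
        current, total = word, 0
    total += int(count)
if current is not None:
    print(f"{current}\t{total}")

Locally the pair can be tested with: cat input.txt | python3 mapper.py | sort | python3 reducer.py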

7

hive

CREATE DATABASE employee_db;
USE employee_db;

CREATE TABLE employees (
    emp_id INT,
    name STRING,
    age INT,
    dept_id INT,
    salary FLOAT
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ','
STORED AS TEXTFILE;

LOAD DATA INPATH '/user/cloudera/employees.txt' INTO TABLE employees;

ALTER TABLE employees ADD COLUMNS (email STRING);

CREATE VIEW high_salary_employees AS
SELECT emp_id, name, salary FROM employees WHERE salary > 50000;

ALTER VIEW high_salary_employees AS
SELECT emp_id, name, age, salary FROM employees WHERE salary > 60000;

DROP VIEW high_salary_employees;

SHOW TABLES;
DESCRIBE employees;
SELECT * FROM employees LIMIT 5;

-- Rename and drop last, so the views and queries above still see the employees table.
ALTER TABLE employees RENAME TO employees_new;
DROP TABLE employees_new;
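LOAD DATA INPATH expects employees.txt to already exist in HDFS, comma-delimited, with fields in schema order. A minimal Python sketch of generating such a file locally; the rows are made-up sample data:

# Write made-up, comma-delimited rows matching the employees schema:
# emp_id, name, age, dept_id, salary
rows = [
    (101, "Asha", 30, 1, 55000.0),
    (102, "Ravi", 28, 2, 48000.0),
    (103, "Meena", 35, 1, 72000.0),
]
with open("employees.txt", "w") as f:
    for r in rows:
        f.write(",".join(str(x) for x in r) + "\n")

Then hdfs dfs -put employees.txt /user/cloudera/ stages it for the LOAD.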

6

pig -x local

students = LOAD '/home/cloudera/Desktop/students.txt' USING PigStorage(',')
    AS (id:int, name:chararray, dept:chararray, marks:int);
departments = LOAD '/home/cloudera/Desktop/departments.txt' USING PigStorage(',')
    AS (dept_code:chararray, dept_name:chararray);

high_scorers = FILTER students BY marks > 70;
DUMP high_scorers;

projected = FOREACH high_scorers GENERATE name, marks;
DUMP projected;

grouped_by_dept = GROUP students BY dept;
DUMP grouped_by_dept;

avg_marks = FOREACH grouped_by_dept GENERATE group AS department, AVG(students.marks) AS average_marks;
DUMP avg_marks;

joined = JOIN students BY dept, departments BY dept_code;
DUMP joined;

sorted = ORDER students BY marks DESC;
DUMP sorted;
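The GROUP ... FOREACH ... AVG pair has the same shape as the shuffle-and-reduce in the MapReduce scripts above. A minimal Python sketch of what avg_marks computes, using made-up tuples in the students schema:

# Made-up (id, name, dept, marks) tuples in the students schema.
students = [
    (1, "Anu", "CSE", 82),
    (2, "Bala", "ECE", 65),
    (3, "Chitra", "CSE", 74),
]

# GROUP students BY dept, then AVG(students.marks) per group.
groups = {}
for _id, _name, dept, marks in students:
    groups.setdefault(dept, []).append(marks)
for dept, marks in groups.items():
    print(dept, sum(marks) / len(marks))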

1

mkdir Hadoop
cd Hadoop
vi hello.txt        (type: Welcome to Big Data, then press Esc and :wq to save)
cat hello.txt

hdfs dfs -mkdir /today
hdfs dfs -put hello.txt /today/data.txt
hdfs dfs -mkdir /input
hdfs dfs -cp /today/data.txt /input/data.txt
hdfs dfs -cat /input/data.txt
hdfs dfs -get /input/data.txt
hdfs dfs -rm /today/data.txt
hdfs dfs -rmdir /today
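The same file operations are available programmatically. Below is a minimal sketch using pyarrow's HadoopFileSystem, assuming a reachable NameNode and a working libhdfs install; "localhost" and 8020 are placeholder coordinates:

# Mirror of the shell session using pyarrow's HDFS client.
# "localhost" and 8020 are placeholder NameNode host/port.
from pyarrow import fs

hdfs = fs.HadoopFileSystem("localhost", 8020)

hdfs.create_dir("/today")
with hdfs.open_output_stream("/today/data.txt") as f:   # like hdfs dfs -put
    f.write(b"Welcome to Big Data\n")

hdfs.create_dir("/input")
hdfs.copy_file("/today/data.txt", "/input/data.txt")    # like hdfs dfs -cp
with hdfs.open_input_stream("/input/data.txt") as f:    # like hdfs dfs -cat
    print(f.read().decode())

hdfs.delete_file("/today/data.txt")                     # like hdfs dfs -rm
hdfs.delete_dir("/today")                               # like hdfs dfs -rmdir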

4 (Jupyter)

import csv

def mapper(tags):
    # Emit one (movieId, tag) pair per tags.csv row.
    return [(r['movieId'], r['tag']) for r in tags]

def reducer(vals):
    # De-duplicate the tags collected for one movie.
    return list(set(vals))

def map_reduce(tags):
    g = {}
    for k, v in mapper(tags):
        g.setdefault(k, []).append(v)
    return {k: reducer(v) for k, v in g.items()}

def load_csv(f):
    with open(f, newline='', encoding='utf-8') as fh:
        return list(csv.DictReader(fh))

def get_titles(rows):
    return {r['movieId']: r['title'] for r in rows}

tags = load_csv('tags.csv')
movies = get_titles(load_csv('movies.csv'))
for m, t in map_reduce(tags).items():
    # Fall back to the raw id if a movieId is missing from movies.csv.
    print(f"Movie: {movies.get(m, m)}, Tags: {', '.join(t)}")
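Run against the MovieLens tags.csv and movies.csv, this prints one line per tagged movie. To sanity-check the grouping without the CSV files, the map_reduce function defined above can be exercised on made-up rows shaped like csv.DictReader output:

# Hypothetical rows with the same keys as tags.csv.
sample_tags = [
    {'movieId': '1', 'tag': 'funny'},
    {'movieId': '1', 'tag': 'funny'},
    {'movieId': '2', 'tag': 'dark'},
]
print(map_reduce(sample_tags))
# Duplicate tags collapse: movie '1' keeps a single 'funny'.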

3 (Jupyter)

def mapper(line):
    # Split a "date,condition" line; return None for malformed lines.
    parts = line.strip().split(',')
    return parts if len(parts) == 2 else None

def reducer(conditions):
    # Most frequent condition recorded for the day.
    return max(set(conditions), key=conditions.count)

def main(file):
    with open(file, encoding='utf-8') as f:
        # Call mapper once per line and keep only well-formed records.
        data = [m for m in (mapper(line) for line in f) if m]
    grouped = {}
    for date, cond in data:
        grouped.setdefault(date, []).append(cond)
    for date in grouped:
        print(f"Weather on {date}: {reducer(grouped[date])}")

# Example usage
if __name__ == "__main__":
    main('weather.txt')
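The reducer's max(set(...), key=conditions.count) scan recounts the list for every distinct condition; collections.Counter takes the same majority vote in one pass. A minimal drop-in sketch:

from collections import Counter

def reducer(conditions):
    # most_common(1) returns [(condition, count)] for the top vote.
    return Counter(conditions).most_common(1)[0][0]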