
5

sudo systemctl start mongod
mongosh

use testDB

db.students.insertMany([
  { _id: 1, name: "Alice", age: 22, marks: 85 },
  { _id: 2, name: "Bob", age: 21, marks: 78 },
  { _id: 3, name: "Charlie", age: 23, marks: 92 },
  { _id: 4, name: "David", age: 20, marks: 88 },
  { _id: 5, name: "Eve", age: 22, marks: 76 }
])

db.students.countDocuments({})
db.students.find().sort({ marks: -1 })
db.students.find().sort({ marks: -1 }).limit(3)
db.students.find().sort({ marks: -1 }).skip(2)
db.students.aggregate([
  { $group: { _id: "$age", avgMarks: { $avg: "$marks" } } }
])
db.students.find().pretty()
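The same aggregation can be driven from Python. Below is a minimal sketch using pymongo, assuming a local mongod on the default port and the testDB database populated as above; the collection and field names come from the shell session, everything else is illustrative.

# Minimal pymongo sketch: average marks per age, mirroring the mongosh aggregate.
# Assumes mongod is running locally on the default port 27017.
from pymongo import MongoClient

client = MongoClient("mongodb://localhost:27017/")
db = client["testDB"]

# Same $group stage as in mongosh: one output document per distinct age.
pipeline = [{"$group": {"_id": "$age", "avgMarks": {"$avg": "$marks"}}}]
for doc in db.students.aggregate(pipeline):
    print(doc["_id"], doc["avgMarks"])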

8

nano wordcount_sim.py

from collections import defaultdict

text = """
Hadoop MapReduce is a software framework
for easily writing applications which process vast amounts
"""

def mapper(line):
    words = line.strip().split()
    return [(word, 1) for word in words]

# Map phase
mapped = []
for line in text.strip().split('\n'):
    mapped.extend(mapper(line))

# Shuffle and sort phase
shuffle_sort = defaultdict(list)
for word, count in mapped:
    shuffle_sort[word].append(count)

# Reduce phase
def reducer(shuffled_data):
    reduced = {}
    for word, counts in shuffled_data.items():
        reduced[word] = sum(counts)
    return reduced

word_counts = reducer(shuffle_sort)

# Output
for word, count in word_counts.items():
    print(f"{word}\t{count}")

Save with Ctrl+O, Enter, then exit with Ctrl+X.

python3 wordcount_sim.py   (or) python wordcount_sim.py
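This in-memory simulation maps one-to-one onto Hadoop Streaming, where the map and reduce phases run as separate scripts connected by stdin/stdout and the framework sorts between them. A minimal sketch under that convention follows; the file names mapper.py and reducer.py are illustrative:

# mapper.py (illustrative name): emit one "word<TAB>1" line per word on stdin.
import sys

for line in sys.stdin:
    for word in line.strip().split():
        print(f"{word}\t1")

# reducer.py (illustrative name): sum counts per word; assumes input
# arrives sorted by key, which the Hadoop shuffle guarantees.
import sys

current, total = None, 0
for line in sys.stdin:
    word, count = line.rstrip("\n").split("\t", 1)
    if word != current:
        if current is not None:
            print(f"{current}\t{total}")
        current, total = word, 0
    total += int(count)
if current is not None:
    print(f"{current}\t{total}")

Locally the pair can be tested with: cat input.txt | python3 mapper.py | sort | python3 reducer.py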

7

hive

CREATE DATABASE employee_db;
USE employee_db;

CREATE TABLE employees (
    emp_id INT,
    name STRING,
    age INT,
    dept_id INT,
    salary FLOAT
)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY ','
STORED AS TEXTFILE;

LOAD DATA INPATH '/user/cloudera/employees.txt' INTO TABLE employees;

ALTER TABLE employees ADD COLUMNS (email STRING);

CREATE VIEW high_salary_employees AS
SELECT emp_id, name, salary FROM employees WHERE salary > 50000;

ALTER VIEW high_salary_employees AS
SELECT emp_id, name, age, salary FROM employees WHERE salary > 60000;

DROP VIEW high_salary_employees;

SHOW TABLES;
DESCRIBE employees;
SELECT * FROM employees LIMIT 5;

-- Rename and drop last, so the views and queries above still see the employees table.
ALTER TABLE employees RENAME TO employees_new;
DROP TABLE employees_new;
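LOAD DATA INPATH expects employees.txt to already exist in HDFS, comma-delimited, with fields in schema order. A minimal Python sketch of generating such a file locally; the rows are made-up sample data:

# Write made-up, comma-delimited rows matching the employees schema:
# emp_id, name, age, dept_id, salary
rows = [
    (101, "Asha", 30, 1, 55000.0),
    (102, "Ravi", 28, 2, 48000.0),
    (103, "Meena", 35, 1, 72000.0),
]
with open("employees.txt", "w") as f:
    for r in rows:
        f.write(",".join(str(x) for x in r) + "\n")

Then hdfs dfs -put employees.txt /user/cloudera/ stages it for the LOAD.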

6

pig -x local

students = LOAD '/home/cloudera/Desktop/students.txt' USING PigStorage(',')
    AS (id:int, name:chararray, dept:chararray, marks:int);
departments = LOAD '/home/cloudera/Desktop/departments.txt' USING PigStorage(',')
    AS (dept_code:chararray, dept_name:chararray);

high_scorers = FILTER students BY marks > 70;
DUMP high_scorers;

projected = FOREACH high_scorers GENERATE name, marks;
DUMP projected;

grouped_by_dept = GROUP students BY dept;
DUMP grouped_by_dept;

avg_marks = FOREACH grouped_by_dept GENERATE group AS department, AVG(students.marks) AS average_marks;
DUMP avg_marks;

joined = JOIN students BY dept, departments BY dept_code;
DUMP joined;

sorted = ORDER students BY marks DESC;
DUMP sorted;
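The GROUP ... FOREACH ... AVG pair has the same shape as the shuffle-and-reduce in the MapReduce scripts above. A minimal Python sketch of what avg_marks computes, using made-up tuples in the students schema:

# Made-up (id, name, dept, marks) tuples in the students schema.
students = [
    (1, "Anu", "CSE", 82),
    (2, "Bala", "ECE", 65),
    (3, "Chitra", "CSE", 74),
]

# GROUP students BY dept, then AVG(students.marks) per group.
groups = {}
for _id, _name, dept, marks in students:
    groups.setdefault(dept, []).append(marks)
for dept, marks in groups.items():
    print(dept, sum(marks) / len(marks))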

1

mkdir Hadoop
cd Hadoop
vi hello.txt        (type: Welcome to Big Data, then press Esc and :wq to save)
cat hello.txt

hdfs dfs -mkdir /today
hdfs dfs -put hello.txt /today/data.txt
hdfs dfs -mkdir /input
hdfs dfs -cp /today/data.txt /input/data.txt
hdfs dfs -cat /input/data.txt
hdfs dfs -get /input/data.txt
hdfs dfs -rm /today/data.txt
hdfs dfs -rmdir /today
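The same file operations are available programmatically. Below is a minimal sketch using pyarrow's HadoopFileSystem, assuming a reachable NameNode and a working libhdfs install; "localhost" and 8020 are placeholder coordinates:

# Mirror of the shell session using pyarrow's HDFS client.
# "localhost" and 8020 are placeholder NameNode host/port.
from pyarrow import fs

hdfs = fs.HadoopFileSystem("localhost", 8020)

hdfs.create_dir("/today")
with hdfs.open_output_stream("/today/data.txt") as f:   # like hdfs dfs -put
    f.write(b"Welcome to Big Data\n")

hdfs.create_dir("/input")
hdfs.copy_file("/today/data.txt", "/input/data.txt")    # like hdfs dfs -cp
with hdfs.open_input_stream("/input/data.txt") as f:    # like hdfs dfs -cat
    print(f.read().decode())

hdfs.delete_file("/today/data.txt")                     # like hdfs dfs -rm
hdfs.delete_dir("/today")                               # like hdfs dfs -rmdir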

4 (Jupyter)

import csv

def mapper(tags):
    # Emit one (movieId, tag) pair per tags.csv row.
    return [(r['movieId'], r['tag']) for r in tags]

def reducer(vals):
    # De-duplicate the tags collected for one movie.
    return list(set(vals))

def map_reduce(tags):
    g = {}
    for k, v in mapper(tags):
        g.setdefault(k, []).append(v)
    return {k: reducer(v) for k, v in g.items()}

def load_csv(f):
    with open(f, newline='', encoding='utf-8') as fh:
        return list(csv.DictReader(fh))

def get_titles(rows):
    return {r['movieId']: r['title'] for r in rows}

tags = load_csv('tags.csv')
movies = get_titles(load_csv('movies.csv'))
for m, t in map_reduce(tags).items():
    # Fall back to the raw id if a movieId is missing from movies.csv.
    print(f"Movie: {movies.get(m, m)}, Tags: {', '.join(t)}")
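Run against the MovieLens tags.csv and movies.csv, this prints one line per tagged movie. To sanity-check the grouping without the CSV files, the map_reduce function defined above can be exercised on made-up rows shaped like csv.DictReader output:

# Hypothetical rows with the same keys as tags.csv.
sample_tags = [
    {'movieId': '1', 'tag': 'funny'},
    {'movieId': '1', 'tag': 'funny'},
    {'movieId': '2', 'tag': 'dark'},
]
print(map_reduce(sample_tags))
# Duplicate tags collapse: movie '1' keeps a single 'funny'.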

3 (Jupyter)

def mapper(line):
    # Split a "date,condition" line; return None for malformed lines.
    parts = line.strip().split(',')
    return parts if len(parts) == 2 else None

def reducer(conditions):
    # Most frequent condition recorded for the day.
    return max(set(conditions), key=conditions.count)

def main(file):
    with open(file, encoding='utf-8') as f:
        # Call mapper once per line and keep only well-formed records.
        data = [m for m in (mapper(line) for line in f) if m]
    grouped = {}
    for date, cond in data:
        grouped.setdefault(date, []).append(cond)
    for date in grouped:
        print(f"Weather on {date}: {reducer(grouped[date])}")

# Example usage
if __name__ == "__main__":
    main('weather.txt')
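The reducer's max(set(...), key=conditions.count) scan recounts the list for every distinct condition; collections.Counter takes the same majority vote in one pass. A minimal drop-in sketch:

from collections import Counter

def reducer(conditions):
    # most_common(1) returns [(condition, count)] for the top vote.
    return Counter(conditions).most_common(1)[0][0]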