import os
from joblib import Parallel, delayed
class Merger:
    """Stub: merge logic is not implemented yet; the commented-out experiments below are kept for reference."""
# import zlib
# dec = zlib.decompressobj(zlib.MAX_WBITS | 16)  # assumed: `dec` was never defined here; a gzip decompressor matches the src_gz name
# with open(r"D:\Xina\Test\Test99\F6\Distributed_File_Systems_Concepts_and_Examples.pdf", 'rb') as src_gz, \
#         open("sink.txt", 'wb') as sink:
#     chunk_size = 1024 * 1024  # 1024 * 1024 bytes = 1 MB
#     while True:
#         chunk = src_gz.read(chunk_size)
#         if not chunk:
#             break
#         sink.write(dec.decompress(chunk))  # was dec.compress(); a decompress object exposes decompress()
#     sink.write(dec.flush())  # drain whatever the decompressor still buffers
#
# import multiprocessing as mp  # os is already imported at the top of this file
#
# def process_wrapper(chunkStart, chunkSize):
#     # binary mode, so the byte offsets produced by chunkify() are valid seek targets
#     with open(r"D:\Xina\Test\Test99\F6\Distributed_File_Systems_Concepts_and_Examples.pdf", 'rb') as f:
#         f.seek(chunkStart)
#         lines = f.read(chunkSize).splitlines()
#         for line in lines:
#             process(line)  # placeholder: process() was never defined in this file
#
# def chunkify(fname, size=1024 * 1024):
#     # yield (start, length) pairs whose boundaries are aligned to newlines
#     fileEnd = os.path.getsize(fname)
#     with open(fname, 'rb') as f:  # 'rb': text mode forbids the relative seek below
#         chunkEnd = f.tell()
#         while True:
#             chunkStart = chunkEnd
#             f.seek(size, 1)   # jump ahead roughly `size` bytes...
#             f.readline()      # ...then advance to the next line boundary
#             chunkEnd = f.tell()
#             yield chunkStart, chunkEnd - chunkStart
#             if chunkEnd >= fileEnd:  # was `>`, which yielded one extra empty chunk
#                 break
#
# if __name__ == '__main__':  # guard required for multiprocessing spawn (e.g. on Windows)
#     # init objects
#     pool = mp.Pool(4)
#     jobs = []
#
#     # create jobs
#     for chunkStart, chunkSize in chunkify("input.txt"):
#         jobs.append(pool.apply_async(process_wrapper, (chunkStart, chunkSize)))
#
#     # wait for all jobs to finish
#     for job in jobs:
#         job.get()
#
#     # clean up
#     pool.close()
#     pool.join()  # was missing: close() alone does not wait for the workers
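

# A minimal sketch of how the chunkify idea above could be driven by the
# joblib import at the top of this file instead of mp.Pool: Parallel/delayed
# replace the pool/apply_async/get plumbing. All names below (_iter_chunks,
# _count_lines, process_file_in_parallel) are hypothetical, and the per-chunk
# work is a placeholder line count, not the eventual merge logic.

def _iter_chunks(fname, size=1024 * 1024):
    """Yield (start, length) pairs whose boundaries fall on newlines."""
    file_end = os.path.getsize(fname)
    with open(fname, 'rb') as f:  # binary mode: text mode forbids relative seeks
        chunk_end = f.tell()
        while chunk_end < file_end:
            chunk_start = chunk_end
            f.seek(size, 1)   # jump ahead roughly `size` bytes...
            f.readline()      # ...then advance to the next line boundary
            chunk_end = f.tell()
            yield chunk_start, chunk_end - chunk_start


def _count_lines(fname, start, length):
    """Placeholder per-chunk work: count the lines inside one chunk."""
    with open(fname, 'rb') as f:
        f.seek(start)
        return len(f.read(length).splitlines())


def process_file_in_parallel(fname, n_jobs=4):
    """Fan the chunks out to joblib workers and collect the per-chunk results."""
    return Parallel(n_jobs=n_jobs)(
        delayed(_count_lines)(fname, start, length)
        for start, length in _iter_chunks(fname)
    )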