1#pragma once 2 3#include <string> 4#include <fcntl.h> 5 6class SegmentInput 7{ 8 public: 9 explicit SegmentInput(const char* filename, int bufsize=kBufferSize) 10 : filename_(filename), 11 fd_(::open(filename, O_RDONLY)), 12 buffer_size_(bufsize), 13 data_(new char[buffer_size_]) 14 { 15 refill(); 16 } 17 18 const std::string& filename() const { return filename_; } 19 int64_t tell() const { return offset_; } 20 const std::string& current_word() const { return word_; } 21 int64_t current_count() const { return count_; } 22 23 bool next() 24 { 25 if (avail_ <= 0) 26 return false; 27 char* nl = static_cast<char*>(::memchr(start_, '\n', avail_)); 28 if (nl) 29 { 30 char* tab = static_cast<char*>(::memchr(start_, '\t', nl - start_)); 31 if (tab) 32 { 33 count_ = strtol(tab+1, NULL, 10); 34 word_ = std::string_view(start_, tab-start_); 35 36 int len = nl - start_ + 1; 37 avail_ -= len; 38 offset_ += len; 39 assert(avail_ >= 0); 40 if (avail_ == 0) 41 { 42 refill(); 43 } 44 else 45 { 46 start_ += len; 47 } 48 return true; 49 } 50 else 51 { 52 avail_ = 0; 53 assert(0); 54 return false; 55 } 56 } 57 else 58 { 59 refill(); 60 return next(); 61 } 62 } 63 64 private: 65 void refill() 66 { 67 start_ = data_.get(); 68 avail_ = ::pread(fd_, start_, buffer_size_, offset_); 69 } 70 71 const std::string filename_; 72 const int fd_; 73 const int buffer_size_; 74 int64_t offset_ = 0; // file position 75 76 char* start_ = nullptr; 77 int avail_ = 0; 78 std::unique_ptr<char[]> data_; 79 80 std::string word_; 81 int64_t count_ = 0; 82 83 SegmentInput(const SegmentInput&) = delete; 84 void operator=(const SegmentInput&) = delete; 85}; 86 87 88