// input.h revision da39c979
1da39c979SShuo Chen#pragma once 2da39c979SShuo Chen 3da39c979SShuo Chen#include <string> 4da39c979SShuo Chen#include <fcntl.h> 5da39c979SShuo Chen 6da39c979SShuo Chenclass SegmentInput 7da39c979SShuo Chen{ 8da39c979SShuo Chen public: 9da39c979SShuo Chen explicit SegmentInput(const char* filename, int bufsize=kBufferSize) 10da39c979SShuo Chen : filename_(filename), 11da39c979SShuo Chen fd_(::open(filename, O_RDONLY)), 12da39c979SShuo Chen buffer_size_(bufsize), 13da39c979SShuo Chen data_(new char[buffer_size_]) 14da39c979SShuo Chen { 15da39c979SShuo Chen refill(); 16da39c979SShuo Chen } 17da39c979SShuo Chen 18da39c979SShuo Chen const std::string& filename() const { return filename_; } 19da39c979SShuo Chen int64_t tell() const { return offset_; } 20da39c979SShuo Chen const std::string& current_word() const { return word_; } 21da39c979SShuo Chen int64_t current_count() const { return count_; } 22da39c979SShuo Chen 23da39c979SShuo Chen bool next() 24da39c979SShuo Chen { 25da39c979SShuo Chen if (avail_ <= 0) 26da39c979SShuo Chen return false; 27da39c979SShuo Chen char* nl = static_cast<char*>(::memchr(start_, '\n', avail_)); 28da39c979SShuo Chen if (nl) 29da39c979SShuo Chen { 30da39c979SShuo Chen char* tab = static_cast<char*>(::memchr(start_, '\t', nl - start_)); 31da39c979SShuo Chen if (tab) 32da39c979SShuo Chen { 33da39c979SShuo Chen count_ = strtol(tab+1, NULL, 10); 34da39c979SShuo Chen word_ = std::string_view(start_, tab-start_); 35da39c979SShuo Chen 36da39c979SShuo Chen int len = nl - start_ + 1; 37da39c979SShuo Chen avail_ -= len; 38da39c979SShuo Chen offset_ += len; 39da39c979SShuo Chen assert(avail_ >= 0); 40da39c979SShuo Chen if (avail_ == 0) 41da39c979SShuo Chen { 42da39c979SShuo Chen refill(); 43da39c979SShuo Chen } 44da39c979SShuo Chen else 45da39c979SShuo Chen { 46da39c979SShuo Chen start_ += len; 47da39c979SShuo Chen } 48da39c979SShuo Chen return true; 49da39c979SShuo Chen } 50da39c979SShuo Chen else 51da39c979SShuo Chen { 52da39c979SShuo Chen avail_ = 
0; 53da39c979SShuo Chen assert(0); 54da39c979SShuo Chen return false; 55da39c979SShuo Chen } 56da39c979SShuo Chen } 57da39c979SShuo Chen else 58da39c979SShuo Chen { 59da39c979SShuo Chen refill(); 60da39c979SShuo Chen return next(); 61da39c979SShuo Chen } 62da39c979SShuo Chen } 63da39c979SShuo Chen 64da39c979SShuo Chen private: 65da39c979SShuo Chen void refill() 66da39c979SShuo Chen { 67da39c979SShuo Chen start_ = data_.get(); 68da39c979SShuo Chen avail_ = ::pread(fd_, start_, buffer_size_, offset_); 69da39c979SShuo Chen } 70da39c979SShuo Chen 71da39c979SShuo Chen const std::string filename_; 72da39c979SShuo Chen const int fd_; 73da39c979SShuo Chen const int buffer_size_; 74da39c979SShuo Chen int64_t offset_ = 0; // file position 75da39c979SShuo Chen 76da39c979SShuo Chen char* start_ = nullptr; 77da39c979SShuo Chen int avail_ = 0; 78da39c979SShuo Chen std::unique_ptr<char[]> data_; 79da39c979SShuo Chen 80da39c979SShuo Chen std::string word_; 81da39c979SShuo Chen int64_t count_ = 0; 82da39c979SShuo Chen 83da39c979SShuo Chen SegmentInput(const SegmentInput&) = delete; 84da39c979SShuo Chen void operator=(const SegmentInput&) = delete; 85da39c979SShuo Chen}; 86da39c979SShuo Chen 87da39c979SShuo Chen 88