input.h revision da39c979
1da39c979SShuo Chen#pragma once
2da39c979SShuo Chen
#include <fcntl.h>
#include <unistd.h>

#include <cassert>
#include <cstdint>
#include <cstdlib>
#include <cstring>
#include <memory>
#include <string>
5da39c979SShuo Chen
6da39c979SShuo Chenclass SegmentInput
7da39c979SShuo Chen{
8da39c979SShuo Chen public:
9da39c979SShuo Chen  explicit SegmentInput(const char* filename, int bufsize=kBufferSize)
10da39c979SShuo Chen    : filename_(filename),
11da39c979SShuo Chen      fd_(::open(filename, O_RDONLY)),
12da39c979SShuo Chen      buffer_size_(bufsize),
13da39c979SShuo Chen      data_(new char[buffer_size_])
14da39c979SShuo Chen  {
15da39c979SShuo Chen    refill();
16da39c979SShuo Chen  }
17da39c979SShuo Chen
18da39c979SShuo Chen  const std::string& filename() const { return filename_; }
19da39c979SShuo Chen  int64_t tell() const { return offset_; }
20da39c979SShuo Chen  const std::string& current_word() const { return word_; }
21da39c979SShuo Chen  int64_t current_count() const { return count_; }
22da39c979SShuo Chen
23da39c979SShuo Chen  bool next()
24da39c979SShuo Chen  {
25da39c979SShuo Chen    if (avail_ <= 0)
26da39c979SShuo Chen      return false;
27da39c979SShuo Chen    char* nl = static_cast<char*>(::memchr(start_, '\n', avail_));
28da39c979SShuo Chen    if (nl)
29da39c979SShuo Chen    {
30da39c979SShuo Chen      char* tab = static_cast<char*>(::memchr(start_, '\t', nl - start_));
31da39c979SShuo Chen      if (tab)
32da39c979SShuo Chen      {
33da39c979SShuo Chen        count_ = strtol(tab+1, NULL, 10);
34da39c979SShuo Chen        word_ = std::string_view(start_, tab-start_);
35da39c979SShuo Chen
36da39c979SShuo Chen        int len = nl - start_ + 1;
37da39c979SShuo Chen        avail_ -= len;
38da39c979SShuo Chen        offset_ += len;
39da39c979SShuo Chen        assert(avail_ >= 0);
40da39c979SShuo Chen        if (avail_ == 0)
41da39c979SShuo Chen        {
42da39c979SShuo Chen          refill();
43da39c979SShuo Chen        }
44da39c979SShuo Chen        else
45da39c979SShuo Chen        {
46da39c979SShuo Chen          start_ += len;
47da39c979SShuo Chen        }
48da39c979SShuo Chen        return true;
49da39c979SShuo Chen      }
50da39c979SShuo Chen      else
51da39c979SShuo Chen      {
52da39c979SShuo Chen        avail_ = 0;
53da39c979SShuo Chen        assert(0);
54da39c979SShuo Chen        return false;
55da39c979SShuo Chen      }
56da39c979SShuo Chen    }
57da39c979SShuo Chen    else
58da39c979SShuo Chen    {
59da39c979SShuo Chen      refill();
60da39c979SShuo Chen      return next();
61da39c979SShuo Chen    }
62da39c979SShuo Chen  }
63da39c979SShuo Chen
64da39c979SShuo Chen private:
65da39c979SShuo Chen  void refill()
66da39c979SShuo Chen  {
67da39c979SShuo Chen    start_ = data_.get();
68da39c979SShuo Chen    avail_ = ::pread(fd_, start_, buffer_size_, offset_);
69da39c979SShuo Chen  }
70da39c979SShuo Chen
71da39c979SShuo Chen  const std::string filename_;
72da39c979SShuo Chen  const int fd_;
73da39c979SShuo Chen  const int buffer_size_;
74da39c979SShuo Chen  int64_t offset_ = 0;  // file position
75da39c979SShuo Chen
76da39c979SShuo Chen  char* start_ = nullptr;
77da39c979SShuo Chen  int avail_ = 0;
78da39c979SShuo Chen  std::unique_ptr<char[]> data_;
79da39c979SShuo Chen
80da39c979SShuo Chen  std::string word_;
81da39c979SShuo Chen  int64_t count_ = 0;
82da39c979SShuo Chen
83da39c979SShuo Chen  SegmentInput(const SegmentInput&) = delete;
84da39c979SShuo Chen  void operator=(const SegmentInput&) = delete;
85da39c979SShuo Chen};
86da39c979SShuo Chen
87da39c979SShuo Chen
88