# kodereviewer/diff_parser.py

import logging
import re
from collections.abc import Sequence

from rich.logging import RichHandler

# Pattern for a unified-diff hunk header such as "@@ -13,10 +13,15 @@ on:".
# Named groups capture the start line and line count of the original and new
# side, plus whatever text follows the closing "@@" (the section heading).
# Both the start line and the ",count" part are required by this pattern.
HEADER_RE = re.compile(
    r'@@ -(?P<original_start_line>\d+),(?P<original_line_count>\d+)'
    r' \+(?P<new_start_line>\d+),(?P<new_line_count>\d+)'
    r' @@ ?(?P<context>.*)'
)

# Logging is configured when this module is imported: INFO level, message-only
# format, with rich's RichHandler as the sole handler.
logging.basicConfig(
    level="INFO",
    format="%(message)s",
    datefmt="[%X]",
    handlers=[RichHandler()],
)
logger = logging.getLogger(__name__)


class SideBySideDiffLine:
    """One row of a side-by-side rendering of a hunk.

    Attributes:
        line: Line number the row represents.
        original: Text shown in the "original" column ('' when that side has
            no line at this number).
        new: Text shown in the "new" column ('' when that side has no line at
            this number).
    """
    line: int
    original: str
    new: str

    def __init__(self, line: int, original: str, new: str):
        self.line = line
        self.original = original
        self.new = new


class Hunk:
    """One hunk of a unified diff: a ``@@`` header line plus its body lines.

    Attributes:
        header: The raw header line.
        lines: Every raw body line that follows the header, unfiltered.
        original_start_line: Start line of the hunk in the original file.
        original_line_count: Number of original-file lines in the hunk.
        new_start_line: Start line of the hunk in the new file.
        new_line_count: Number of new-file lines in the hunk.
        context: Text following the closing ``@@`` of the header (may be '').
        original: Body lines of the original side (context and ``-`` lines),
            each still carrying its leading marker character.
        new: Body lines of the new side (context and ``+`` lines), each still
            carrying its leading marker character.
    """
    header: str
    lines: list[str]

    original_start_line: int
    original_line_count: int
    new_start_line: int
    new_line_count: int
    context: str

    original: list[str]
    new: list[str]

    # Same named groups as the module-level HEADER_RE, but the ",count" part
    # is optional: the unified format allows it to be omitted when the count
    # is 1 (e.g. "@@ -0,0 +1 @@" for a newly added one-line file).
    _HEADER_RE = re.compile(
        r'@@ -(?P<original_start_line>\d+)(?:,(?P<original_line_count>\d+))?'
        r' \+(?P<new_start_line>\d+)(?:,(?P<new_line_count>\d+))?'
        r' @@ ?(?P<context>.*)'
    )

    def __init__(self, hunk_str: str):
        """Parse *hunk_str*, whose first line must be a hunk header.

        Raises:
            ValueError: If *hunk_str* is empty or its first line is not a
                valid hunk header.
        """
        rows = hunk_str.splitlines()
        if not rows:
            raise ValueError('Error parsing empty hunk')
        header, *body = rows
        self.header = header
        self.lines = body

        self.parse_header(header)
        self.parse_body(body)

    def parse_header(self, header: str) -> None:
        """Parses a header.
        The format is: @@ -l,s +l,s @@ optional section heading
        where
          l: starting line
          s: number of lines (may be omitted together with the comma,
             in which case it is 1)

        example: @@ -13,10 +13,15 @@ on:

        Raises:
            ValueError: If *header* does not match the format above.
        """
        match = self._HEADER_RE.match(header)
        if match is None:
            raise ValueError(f'Error parsing {header}')

        self.original_start_line = int(match.group('original_start_line'))
        # A missing count group is None; '0' is a truthy string, so an
        # explicit zero count is preserved and only None falls back to 1.
        self.original_line_count = int(match.group('original_line_count') or 1)
        self.new_start_line = int(match.group('new_start_line'))
        self.new_line_count = int(match.group('new_line_count') or 1)
        self.context = match.group('context')

    def parse_body(self, lines: list[str]) -> None:
        """Split body *lines* into the ``original`` and ``new`` sides.

        ``-`` lines go to ``original``, ``+`` lines go to ``new`` and every
        other line (context) goes to both. Lines keep their marker character.
        """
        self.original = []
        self.new = []

        for line in lines:
            if line.startswith('\\'):
                # "\ No newline at end of file" annotates the preceding line;
                # it is not a line of either file and is not counted in the
                # header, so keeping it would shift every later row.
                continue
            if line.startswith('-'):
                self.original.append(line)
            elif line.startswith('+'):
                self.new.append(line)
            else:
                self.original.append(line)
                self.new.append(line)

    def side_by_side(self) -> Sequence[SideBySideDiffLine]:
        """Pair up the two sides of the hunk by line number.

        Returns:
            One row per line number from the lowest to the highest line
            covered by either side. A side that has no line at a given
            number contributes ''. Empty if both sides have a count of 0.
        """
        original_rows = range(self.original_start_line,
                              self.original_start_line + self.original_line_count)
        new_rows = range(self.new_start_line,
                         self.new_start_line + self.new_line_count)

        # A side with a zero count covers no lines, so its start line must not
        # widen the rendered range (e.g. "-0,0" would add a blank row 0).
        covered = [rows for rows in (original_rows, new_rows) if rows]
        if not covered:
            return []
        start = min(rows.start for rows in covered)
        end = max(rows.stop for rows in covered)

        retval: list[SideBySideDiffLine] = []
        for line in range(start, end):
            # Each side's lines are consecutive, so the offset from the
            # side's start line is the index into its list.
            original = self.original[line - original_rows.start] if line in original_rows else ''
            new = self.new[line - new_rows.start] if line in new_rows else ''
            retval.append(SideBySideDiffLine(line, original, new))

        return retval


class GithubDiffParser:
    """Splits the text of a unified diff into its hunks.

    Attributes:
        diff: The diff text handed to the constructor.
    """
    diff: str

    def __init__(self, diff: str):
        self.diff = diff

    def hunks(self) -> list[Hunk]:
        """Return one ``Hunk`` per ``@@`` header found in the diff.

        A hunk starts at a line beginning with ``@@`` and runs until the next
        such line, the next ``diff --git`` file header, or the end of the
        text. Lines outside any hunk (file headers such as ``diff --git``,
        ``index``, ``---`` and ``+++`` ahead of a file's first hunk) are
        skipped instead of being fed to ``Hunk`` as if they were a header.

        Raises:
            ValueError: Propagated from ``Hunk`` when a line starting with
                ``@@`` is not a parsable hunk header.
        """
        groups: list[list[str]] = []
        current = None  # lines of the hunk being collected; None outside a hunk

        for line in self.diff.splitlines():
            if line.startswith('@@'):
                logger.info('adding %s', line)
                current = [line]
                groups.append(current)
            elif line.startswith('diff --git '):
                # Hunk body lines start with ' ', '+', '-' or '\', so this can
                # only be the next file's header: the current hunk is over.
                current = None
            elif current is not None:
                current.append(line)

        # Terminate every line (including the last) with '\n' so a trailing
        # empty body line survives the splitlines() done inside Hunk.
        return [Hunk('\n'.join(group) + '\n') for group in groups]

if __name__ == '__main__':
    # Command-line use: render every hunk of the diff file named by the last
    # argument as a three-column table (line number, original, new).
    import sys

    from rich.console import Console
    from rich.table import Table

    # Without an argument sys.argv[-1] is this script's own path, which would
    # be read and parsed as if it were a diff.
    if len(sys.argv) < 2:
        sys.exit(f'usage: {sys.argv[0]} DIFF_FILE')

    filename = sys.argv[-1]

    # Read everything up front so the file is closed before rendering starts.
    with open(filename) as fp:
        parser = GithubDiffParser(fp.read())

    # One console serves all tables.
    console = Console()
    for hunk in parser.hunks():
        table = Table(title=hunk.context)
        table.add_column('line')
        table.add_column('original')
        table.add_column('new')

        for diff_line in hunk.side_by_side():
            table.add_row(str(diff_line.line), diff_line.original, diff_line.new)

        console.print(table)