llvm/utils/extract-section.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153

#!/usr/bin/env python
from __future__ import print_function

"""
Helper script to print out the raw content of an ELF section.
Example usages:
```
# print out as bits by default
extract-section.py .text --input-file=foo.o
```
```
# read from stdin and print out in hex
cat foo.o | extract-section.py -h .text
```
This is merely a wrapper around `llvm-readobj` that focuses on the binary
content as well as providing more formatting options.
"""

# Unfortunately reading binary from stdin is not so trivial in Python...
def read_raw_stdin():
    """Read all of standard input and return it without text decoding.

    On Python 3 the data comes from ``sys.stdin.buffer`` (the binary layer
    underneath the text-mode ``sys.stdin``). On Python 2 ``sys.stdin`` is read
    directly; on Windows the descriptor is first switched to binary mode.

    Returns:
        Whatever ``read()`` on the selected stream yields: ``bytes`` on
        Python 3, ``str`` on Python 2.
    """
    import sys

    if sys.version_info >= (3, 0):
        return sys.stdin.buffer.read()

    # Python 2 only from here on.
    if sys.platform == "win32":
        import os
        import msvcrt

        # Windows opens stdin in text mode, so switch the descriptor to
        # binary before reading. The function that does this is
        # msvcrt.setmode(); `msvcrt.setformat` does not exist and raised
        # AttributeError on this path.
        msvcrt.setmode(sys.stdin.fileno(), os.O_BINARY)
    return sys.stdin.read()


def get_raw_section_dump(readobj_path, section_name, input_file):
    """Run the readobj tool's hex dump for one section and return its stdout.

    Args:
        readobj_path: Executable to launch (expected to be llvm-readobj).
        section_name: Section name, substituted into ``--hex-dump=<name>``.
        input_file: Path handed to the tool, or ``"-"`` to forward this
            process's own standard input to the tool.

    Returns:
        The tool's standard output as a text string (decoded as UTF-8 when
        the subprocess returned bytes).
    """
    import subprocess

    hex_dump_flag = "--hex-dump={}".format(section_name)
    readobj = subprocess.Popen(
        [readobj_path, "--elf-output-style=GNU", hex_dump_flag, input_file],
        stdin=subprocess.PIPE,
        stdout=subprocess.PIPE,
    )

    # Only pull data from our own stdin when the caller asked for it;
    # otherwise the child's stdin pipe is simply closed by communicate().
    payload = read_raw_stdin() if input_file == "-" else None
    stdout_data, _ = readobj.communicate(input=payload)

    # Python 2 already hands back `str`; Python 3 hands back `bytes`.
    if isinstance(stdout_data, str):
        return stdout_data
    return stdout_data.decode("utf-8")


# Element widths (in bytes) that evenly divide the 32-bit words of the dump.
_VALID_HEX_WIDTHS = (1, 2, 4)


def _format_section_dump(
    raw_section, output_format, bits_endian="big", byte_indicator=False, hex_width=4
):
    """Convert hex-dump text into the space-separated string this tool prints.

    Lines starting with ``"Hex dump"`` are skipped. On every other line the
    first space-separated field is dropped (presumably the address column --
    verify against llvm-readobj's output) and up to four following fields are
    parsed as hexadecimal 32-bit words; parsing of a line stops at the first
    field that is not valid hex.

    Args:
        raw_section: Text of the dump, as returned by get_raw_section_dump().
        output_format: ``"bits"`` or ``"hex"``; any other value emits nothing.
        bits_endian: In bits mode, ``"little"`` emits the least significant
            byte of each word first; anything else emits the most significant
            byte first. Bits inside a byte are always MSB first.
        byte_indicator: In bits mode, emit a ``"."`` element after every byte.
        hex_width: In hex mode, number of bytes per printed element.

    Returns:
        All emitted elements joined with single spaces.

    Raises:
        ValueError: In hex mode, if ``hex_width`` is not 1, 2 or 4.
    """
    if output_format == "bits":
        byte_offsets = (0, 8, 16, 24) if bits_endian == "little" else (24, 16, 8, 0)
    elif output_format == "hex":
        if hex_width not in _VALID_HEX_WIDTHS:
            # A width of 3 does not divide a 32-bit word and used to drop the
            # lowest byte of every word; other values were silently ignored.
            raise ValueError("hex width must be 1, 2 or 4 bytes")
        width_bits = hex_width * 8
        element_offsets = range(32 - width_bits, -1, -width_bits)
        element_mask = (1 << width_bits) - 1
        element_format = "{:0" + str(hex_width * 2) + "x}"

    results = []
    for line in raw_section.splitlines():
        if line.startswith("Hex dump"):
            continue
        # Split on a single space on purpose: runs of padding spaces become
        # empty fields, which fail to parse and end the line early.
        for field in line.strip().split(" ")[1:5]:
            try:
                val = int(field, 16)
            except ValueError:
                # exclude any non-hex dump string
                break
            # NOTE(review): a field shorter than 8 hex digits is still emitted
            # as a full 32-bit word (zero-extended) -- confirm whether the
            # dump can contain such partial groups.
            if output_format == "bits":
                for byte_off in byte_offsets:
                    byte = (val >> byte_off) & 0xFF
                    results.extend(
                        str((byte >> bit_off) & 1) for bit_off in range(7, -1, -1)
                    )
                    if byte_indicator:
                        results.append(".")
            elif output_format == "hex":
                results.extend(
                    element_format.format((val >> off) & element_mask)
                    for off in element_offsets
                )
    return " ".join(results)


if __name__ == "__main__":
    import argparse

    # The default '-h' (--help) will conflict with our '-h' (hex) format
    arg_parser = argparse.ArgumentParser(add_help=False)
    arg_parser.add_argument(
        "--readobj-path",
        metavar="<executable path>",
        type=str,
        help="Path to llvm-readobj",
    )
    arg_parser.add_argument(
        "--input-file",
        metavar="<file>",
        type=str,
        help="Input object file, or '-' to read from stdin",
    )
    arg_parser.add_argument(
        "section", metavar="<name>", type=str, help="Name of the section to extract"
    )
    # Output format
    format_group = arg_parser.add_mutually_exclusive_group()
    format_group.add_argument(
        "-b",
        dest="format",
        action="store_const",
        const="bits",
        help="Print out in bits",
    )
    arg_parser.add_argument(
        "--byte-indicator",
        action="store_true",
        help="Whether to print a '.' every 8 bits in bits printing mode",
    )
    arg_parser.add_argument(
        "--bits-endian",
        metavar="<little/big>",
        type=str,
        choices=["little", "big"],
        help="Print out bits in specified endianness (little or big); defaults to big",
    )
    format_group.add_argument(
        "-h",
        dest="format",
        action="store_const",
        const="hex",
        help="Print out in hexadecimal",
    )
    arg_parser.add_argument(
        "--hex-width",
        metavar="<# of bytes>",
        type=int,
        help="The width (in byte) of every element in hex printing mode",
    )

    arg_parser.add_argument("--help", action="help")
    # The default tool path is registered under `readobj_path`, the dest of
    # --readobj-path. It used to be stored as `tool_path`, so the value given
    # on the command line was never read.
    arg_parser.set_defaults(
        format="bits",
        readobj_path="llvm-readobj",
        input_file="-",
        byte_indicator=False,
        hex_width=4,
        bits_endian="big",
    )
    args = arg_parser.parse_args()

    # Reject an unusable width up front instead of silently printing nothing.
    if args.format == "hex" and args.hex_width not in _VALID_HEX_WIDTHS:
        arg_parser.error("--hex-width must be 1, 2 or 4")

    raw_section = get_raw_section_dump(
        args.readobj_path, args.section, args.input_file
    )

    print(
        _format_section_dump(
            raw_section,
            args.format,
            bits_endian=args.bits_endian,
            byte_indicator=args.byte_indicator,
            hex_width=args.hex_width,
        ),
        end="",
    )