the refinery.units.pattern.carve documentation

Expand source code Browse git
from __future__ import annotations

import msgpack

from refinery.lib.patterns import formats, pattern_with_size_limits
from refinery.lib.types import Param
from refinery.units import Chunk
from refinery.units.pattern import Arg, PatternExtractor


class carve(PatternExtractor):
    """
    Extracts and optionally decodes data in named formats from the input: base64, hex, string literals.

    The complete list of supported formats is as follows:\n\n{}
    """
    # NOTE: the docstring above is a format template. Its single placeholder is filled in
    # with the table of supported formats right after the class definition, so it must
    # not contain any other braces.

    def __init__(
        self, format: Param[str, Arg.String(metavar='format', help='Specify one of the available long or short format specifiers.')],
        unique: Param[bool, Arg.Switch('-q', help='Yield every match only once.')] = False,
        decode: Param[bool, Arg.Switch('-d', help='Automatically decode known patterns.')] = False,
        single: Param[bool, Arg.Switch('-s', help='Only get the biggest match; equivalent to -qlt1')] = False,
        min=1, max=0, len=0,
        stripspace=False, longest=False, take=0, utf16=True, ascii=True
    ):
        # The parameters min, max, len, stripspace, longest, take, utf16 and ascii are
        # forwarded to the PatternExtractor constructor below; unique is forwarded in
        # inverted form as duplicates.
        if single:
            # The single switch overrides take, longest and unique in one go.
            take = 1
            longest = True
            unique = True
        try:
            format = formats.from_dashname(format)
        except Exception as error:
            # Chain the lookup failure explicitly so that the root cause remains attached
            # to the ValueError that is raised for an unknown format specifier.
            raise ValueError(F'{format} is not a valid format') from error
        super().__init__(
            min=min,
            max=max,
            len=len,
            stripspace=stripspace,
            duplicates=not unique,
            longest=longest,
            take=take,
            ascii=ascii,
            utf16=utf16,
            format=format
        )
        # Select the decoder that matches the chosen format. The decoder units are imported
        # lazily, only inside the branch that is taken. When decoding is not requested, or
        # when the format has no branch below, the decoder is None and process yields the
        # carved matches without decoding them.
        if not decode:
            decoder = None
        elif self.args.format == formats.str:
            from ..encoding.esc import esc
            decoder = esc(unicode=True, quoted=True)
        elif self.args.format == formats.int:
            from ..encoding.base import base
            decoder = base()
        elif self.args.format in (formats.base16, formats.base16s, formats.hex):
            from ..encoding.hex import hex
            decoder = hex()
        elif self.args.format == formats.hexdump:
            from ..formats.hexload import hexload
            decoder = hexload()
        elif self.args.format == formats.intarray:
            from ..blockwise.pack import pack
            decoder = pack()
        elif self.args.format == formats.strarray:
            from ..encoding.esc import esc
            # Every string literal found inside the carved array is piped through esc and
            # the resulting list is serialized with msgpack; a falsy packb result is
            # replaced by empty bytes.
            def _decoder(data: Chunk) -> bytes: # noqa
                return msgpack.packb([
                    m[0] | esc | bytes for m in formats.str.value.finditer(data)]) or B''
            decoder = _decoder
        elif self.args.format in (formats.base64, formats.base64s):
            from ..encoding.b64 import b64
            decoder = b64()
        elif self.args.format in (formats.base85, formats.base85s):
            from ..encoding.b85 import b85
            decoder = b85()
        elif self.args.format == formats.base64u:
            from ..encoding.b64 import b64
            decoder = b64(urlsafe=True)
        elif self.args.format == formats.base32:
            from ..encoding.b32 import b32
            decoder = b32()
        elif self.args.format == formats.ps1str:
            from ..encoding.escps import escps
            decoder = escps()
        elif self.args.format == formats.htmlesc:
            from ..encoding.htmlesc import htmlesc
            decoder = htmlesc()
        elif self.args.format == formats.vbastr:
            # NOTE(review): vbastr matches are decoded with the same escps unit that is
            # used for ps1str; confirm that this is intended.
            from ..encoding.escps import escps
            decoder = escps()
        elif self.args.format == formats.hexarray:
            from ..blockwise.pack import pack
            decoder = pack(0x10)
        elif self.args.format == formats.wshenc:
            from ..encoding.wshenc import wshenc
            decoder = wshenc()
        elif self.args.format == formats.uuenc:
            from ..encoding.uuenc import uuenc
            decoder = uuenc()
        elif self.args.format in (
            formats.urlquote,
            formats.urlhex,
        ):
            from ..encoding.url import url
            decoder = url()
        else:
            decoder = None
        self.decoder = decoder

    def process(self, data):
        # Yields the filtered matches of the selected format pattern in data. When a decoder
        # was configured by the constructor, each match is decoded before it is yielded.
        sizes = self._getbounds()
        # The lower size bound is raised to at least 1 and the upper bound is passed as an
        # absolute value.
        pattern = pattern_with_size_limits(
            self.args.format.value, max(1, sizes.min), abs(sizes.max))
        self.log_info('using pattern:', pattern.str.pattern)
        it = iter(self.matches_filtered(memoryview(data), pattern.bin))
        if (decoder := self.decoder) is None:
            yield from it
        else:
            for chunk in it:
                try:
                    yield decoder(chunk)
                except Exception as E:
                    # Best effort: a match that fails to decode is logged and skipped so
                    # that the remaining matches are still processed.
                    self.log_info(F'decoder failure: {E!s}')


# The carve docstring is a template: when a docstring is present, its placeholder is
# replaced by the table that formats.make_table_with_shorts generates.
__d = carve.__doc__
if __d:
    carve.__doc__ = __d.format(formats.make_table_with_shorts('FORMAT'))


class csd(carve):
    """
    Short for carve & decode; carves the single largest buffer of a given format from the input
    and decodes it with the appropriate decoder. See the carve help for detailed information on
    format specifiers.
    """
    def __init__(self, format, utf16=True, ascii=True, stripspace=False):
        # Switches that this shortcut always enables on the underlying carve unit.
        preset = dict(decode=True, single=True)
        # Options chosen by the caller are handed to carve unchanged.
        forwarded = dict(utf16=utf16, ascii=ascii, stripspace=stripspace)
        super().__init__(format, **preset, **forwarded)


class csb(carve):
    """
    Short for carve single buffer; carves the single largest buffer of a given format from the
    input data and returns it. See the carve help for detailed information on format specifiers.
    """
    def __init__(self, format, utf16=True, ascii=True, stripspace=False):
        # This shortcut always requests a single match and never decodes it.
        preset = dict(decode=False, single=True)
        # Options chosen by the caller are handed to carve unchanged.
        forwarded = dict(utf16=utf16, ascii=ascii, stripspace=stripspace)
        super().__init__(format, **preset, **forwarded)
class carve (format, unique=False, decode=False, single=False, min=1, max=0, len=0, stripspace=False, longest=False, take=0, utf16=True, ascii=True)

Extracts and optionally decodes data in named formats from the input: base64, hex, string literals.

The complete list of supported formats is as follows:

   FORMAT SHORT DESCRIPTION
  integer int   any integer literal expression
    float flt   floating point literals
   number num   either an integer or a float
   string str   c-syntax string literal
   cmdstr       Windows command line escaped string literal
   ps1str       PowerShell escaped string literal
   vbastr       VBS/VBA string literal
   vbaint       VBS/VBA integer literal
printable ps    printable strings (includes whitespace)
 urlquote uq    url-encoded characters, default char set
   urlhex uh    hex-encoded buffer using URL escape sequences
  htmlesc       sequence of HTML-escape characters
 intarray [int] integers separated by commas or semicolons
 strarray [str] strings separated by commas or semicolons
 numarray [num] numbers separated by commas or semicolons
 hexarray [hex] hex sequences separated by commas or semicolons
  letters       alphabetic characters
   wshenc       encoded Windows Scripting Host Scripts (JS/VBS)
    alnum       alphanumeric characters
   base32 b32   Base32 encoded strings
   base58 b58   Base58 encoded strings
   base62 b62   Base62 encoded strings
   base64 b64   Base64 encoded strings
   base85 b85   Base85 encoded strings
  ascii85 a85   Ascii85 encoded strings
      z85       Z85 encoded strings
   base92 b92   Base92 encoded strings
  base64u b64u  Base64 encoded strings using URL-safe alphabet
      hex       hexadecimal strings
   base16 b16   uppercase hexadecimal strings
  base16s b16s  hexadecimal strings
  base64s b64s  Base64 encoded strings, separated by whitespace
  base85s b85s  Base85 encoded strings, separated by whitespace
     a85s       Ascii85 encoded strings, separated by whitespace
     z85s       Z85 encoded strings, separated by whitespace
     utf8       sequences of bytes that can be decoded as UTF8
  hexdump hd    typical hexdump output
    uuenc       UUEncoded data
Expand source code Browse git
class carve(PatternExtractor):
    """
    Extracts and optionally decodes data in named formats from the input: base64, hex, string literals.

    The complete list of supported formats is as follows:\n\n{}
    """
    # NOTE: the docstring above is a format template. Its single placeholder is filled in
    # with the table of supported formats right after the class definition, so it must
    # not contain any other braces.

    def __init__(
        self, format: Param[str, Arg.String(metavar='format', help='Specify one of the available long or short format specifiers.')],
        unique: Param[bool, Arg.Switch('-q', help='Yield every match only once.')] = False,
        decode: Param[bool, Arg.Switch('-d', help='Automatically decode known patterns.')] = False,
        single: Param[bool, Arg.Switch('-s', help='Only get the biggest match; equivalent to -qlt1')] = False,
        min=1, max=0, len=0,
        stripspace=False, longest=False, take=0, utf16=True, ascii=True
    ):
        # The parameters min, max, len, stripspace, longest, take, utf16 and ascii are
        # forwarded to the PatternExtractor constructor below; unique is forwarded in
        # inverted form as duplicates.
        if single:
            # The single switch overrides take, longest and unique in one go.
            take = 1
            longest = True
            unique = True
        try:
            format = formats.from_dashname(format)
        except Exception as error:
            # Chain the lookup failure explicitly so that the root cause remains attached
            # to the ValueError that is raised for an unknown format specifier.
            raise ValueError(F'{format} is not a valid format') from error
        super().__init__(
            min=min,
            max=max,
            len=len,
            stripspace=stripspace,
            duplicates=not unique,
            longest=longest,
            take=take,
            ascii=ascii,
            utf16=utf16,
            format=format
        )
        # Select the decoder that matches the chosen format. The decoder units are imported
        # lazily, only inside the branch that is taken. When decoding is not requested, or
        # when the format has no branch below, the decoder is None and process yields the
        # carved matches without decoding them.
        if not decode:
            decoder = None
        elif self.args.format == formats.str:
            from ..encoding.esc import esc
            decoder = esc(unicode=True, quoted=True)
        elif self.args.format == formats.int:
            from ..encoding.base import base
            decoder = base()
        elif self.args.format in (formats.base16, formats.base16s, formats.hex):
            from ..encoding.hex import hex
            decoder = hex()
        elif self.args.format == formats.hexdump:
            from ..formats.hexload import hexload
            decoder = hexload()
        elif self.args.format == formats.intarray:
            from ..blockwise.pack import pack
            decoder = pack()
        elif self.args.format == formats.strarray:
            from ..encoding.esc import esc
            # Every string literal found inside the carved array is piped through esc and
            # the resulting list is serialized with msgpack; a falsy packb result is
            # replaced by empty bytes.
            def _decoder(data: Chunk) -> bytes: # noqa
                return msgpack.packb([
                    m[0] | esc | bytes for m in formats.str.value.finditer(data)]) or B''
            decoder = _decoder
        elif self.args.format in (formats.base64, formats.base64s):
            from ..encoding.b64 import b64
            decoder = b64()
        elif self.args.format in (formats.base85, formats.base85s):
            from ..encoding.b85 import b85
            decoder = b85()
        elif self.args.format == formats.base64u:
            from ..encoding.b64 import b64
            decoder = b64(urlsafe=True)
        elif self.args.format == formats.base32:
            from ..encoding.b32 import b32
            decoder = b32()
        elif self.args.format == formats.ps1str:
            from ..encoding.escps import escps
            decoder = escps()
        elif self.args.format == formats.htmlesc:
            from ..encoding.htmlesc import htmlesc
            decoder = htmlesc()
        elif self.args.format == formats.vbastr:
            # NOTE(review): vbastr matches are decoded with the same escps unit that is
            # used for ps1str; confirm that this is intended.
            from ..encoding.escps import escps
            decoder = escps()
        elif self.args.format == formats.hexarray:
            from ..blockwise.pack import pack
            decoder = pack(0x10)
        elif self.args.format == formats.wshenc:
            from ..encoding.wshenc import wshenc
            decoder = wshenc()
        elif self.args.format == formats.uuenc:
            from ..encoding.uuenc import uuenc
            decoder = uuenc()
        elif self.args.format in (
            formats.urlquote,
            formats.urlhex,
        ):
            from ..encoding.url import url
            decoder = url()
        else:
            decoder = None
        self.decoder = decoder

    def process(self, data):
        # Yields the filtered matches of the selected format pattern in data. When a decoder
        # was configured by the constructor, each match is decoded before it is yielded.
        sizes = self._getbounds()
        # The lower size bound is raised to at least 1 and the upper bound is passed as an
        # absolute value.
        pattern = pattern_with_size_limits(
            self.args.format.value, max(1, sizes.min), abs(sizes.max))
        self.log_info('using pattern:', pattern.str.pattern)
        it = iter(self.matches_filtered(memoryview(data), pattern.bin))
        if (decoder := self.decoder) is None:
            yield from it
        else:
            for chunk in it:
                try:
                    yield decoder(chunk)
                except Exception as E:
                    # Best effort: a match that fails to decode is logged and skipped so
                    # that the remaining matches are still processed.
                    self.log_info(F'decoder failure: {E!s}')

Ancestors

Subclasses

Class variables

var reverse

The type of the None singleton.

Inherited members

class csd (format, utf16=True, ascii=True, stripspace=False)

Short for carve & decode; carves the single largest buffer of a given format from the input and decodes it with the appropriate decoder. See the carve help for detailed information on format specifiers.

Expand source code Browse git
class csd(carve):
    """
    Short for carve & decode; carves the single largest buffer of a given format from the input
    and decodes it with the appropriate decoder. See the carve help for detailed information on
    format specifiers.
    """
    def __init__(self, format, utf16=True, ascii=True, stripspace=False):
        # Switches that this shortcut always enables on the underlying carve unit.
        preset = dict(decode=True, single=True)
        # Options chosen by the caller are handed to carve unchanged.
        forwarded = dict(utf16=utf16, ascii=ascii, stripspace=stripspace)
        super().__init__(format, **preset, **forwarded)

Ancestors

Subclasses

Inherited members

class csb (format, utf16=True, ascii=True, stripspace=False)

Short for carve single buffer; carves the single largest buffer of a given format from the input data and returns it. See the carve help for detailed information on format specifiers.

Expand source code Browse git
class csb(carve):
    """
    Short for carve single buffer; carves the single largest buffer of a given format from the
    input data and returns it. See the carve help for detailed information on format specifiers.
    """
    def __init__(self, format, utf16=True, ascii=True, stripspace=False):
        # This shortcut always requests a single match and never decodes it.
        preset = dict(decode=False, single=True)
        # Options chosen by the caller are handed to carve unchanged.
        forwarded = dict(utf16=utf16, ascii=ascii, stripspace=stripspace)
        super().__init__(format, **preset, **forwarded)

Ancestors

Subclasses

Inherited members