diff --git a/CHANGELOG.md b/CHANGELOG.md index 5e5f4ae..340c53b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,19 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. ## [Unreleased] +### Added + +- Add `errors` option in `TorrentFileParser` and `parse_torrent_file` to let the user set the encoding error handler. (Thanks [@yasuotakei](https://github.com/yasuotakei)) +- Add `-e`/`--errors` CLI option to set the `errors` option of `parse_torrent_file`. +- `BDecoder` class and `decode` shortcut function to directly decode bytes. +- `encode` shortcut function to directly encode data to bytes. + +### Changed + +- **Breaking Change** `TorrentFileCreator` is renamed to `BEncoder` as the original name doesn't describe its function. +- `TorrentFileParser` doesn't need the outermost level of parsed data to be a `dict` now +- `BEncoder` doesn't need the outermost level of encoded data to be a `dict` now + ## [0.2.0] - 2018.5.25 ### Change @@ -17,7 +30,7 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0. ### Added -- `TorrentFileCreator` class and `create_torrent_file` shortcut function for write back data to a torrent file +- `TorrentFileCreator` class and `create_torrent_file` shortcut function for writing data back to a torrent file. ## [0.1.4] - 2018-04-06 diff --git a/LICENSE b/LICENSE index a4edad4..70ebd9a 100644 --- a/LICENSE +++ b/LICENSE @@ -1,6 +1,6 @@ The MIT License (MIT) -Copyright (c) 2017 7sDream +Copyright (c) 2017 - 2018 7sDream Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal diff --git a/README.md b/README.md index a71a935..7fb6dc8 100644 --- a/README.md +++ b/README.md @@ -4,6 +4,15 @@ A simple parser for `.torrent` file. Can also edit and write back to torrent format after version 0.2.0. 
+## Features + +- Decoder and encoder for torrent files +- Auto decode bytes field to string with user specified encoding and error handler +- Auto detect encoding when using `auto` as encoding (needs `chardet` installed) +- Auto decode hash value field to hash blocks +- Uniform exception type +- CLI provided, with JSON output + ## Install ``` @@ -34,12 +43,23 @@ $ cat test.torrent | pytp ```pycon >>> import torrent_parser as tp >>> data = tp.parse_torrent_file('test.torrent') ->>> print(data['announce']) +>>> data['announce'] http://tracker.trackerfix.com:80/announce >>> data['announce'] = 'http://127.0.0.1:12345' >>> tp.create_torrent_file('new.torrent', data) ``` +or, if you don't work with files but just raw bytes: + +```pycon +>>> import torrent_parser as tp +>>> data = tp.decode(b'd3:negi-1ee') +>>> data['neg'] +-1 +>>> tp.encode(data) +b'd3:negi-1ee' +``` + ## Test ```bash @@ -58,4 +78,4 @@ See [License][LICENSE]. [screenshots-normal]: http://rikka-10066868.image.myqcloud.com/1492616d-9f14-4fe2-9146-9a3ac06c6868.png [screenshots-indent]: http://rikka-10066868.image.myqcloud.com/eadc4184-6deb-42eb-bfd4-239da8f50c08.png [LICENSE]: https://github.com/7sDream/torrent_parser/blob/master/LICENSE -[CHANGELOG]: https://github.com/7sDream/torrent_parser/blob/master/CHANGELOG.md \ No newline at end of file +[CHANGELOG]: https://github.com/7sDream/torrent_parser/blob/master/CHANGELOG.md diff --git a/tests/__init__.py b/tests/__init__.py index fb128c4..10492bb 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -1,2 +1,5 @@ from .test_create import * from .test_parse import * +from .test_encoding_error import * +from .test_encode import * +from .test_decode import * diff --git a/tests/test_create.py b/tests/test_create.py index 8826896..d051505 100644 --- a/tests/test_create.py +++ b/tests/test_create.py @@ -6,7 +6,7 @@ import io import os.path import unittest -from torrent_parser import TorrentFileParser, TorrentFileCreator +from torrent_parser import TorrentFileParser, 
BEncoder class TestCreate(unittest.TestCase): @@ -17,15 +17,19 @@ class TestCreate(unittest.TestCase): data = collections.OrderedDict() data['a'] = 1 data['b'] = 2 - self.assertEqual(TorrentFileCreator(data).encode(), b'd1:ai1e1:bi2ee') + self.assertEqual(BEncoder(data).encode(), b'd1:ai1e1:bi2ee') def test_same_output_if_no_edit(self): with open(self.REAL_FILE, 'rb') as fp: in_data = fp.read() data = TorrentFileParser(io.BytesIO(in_data), True).parse() - out_data = TorrentFileCreator(data).encode() + out_data = BEncoder(data).encode() m1 = hashlib.md5() m1.update(in_data) m2 = hashlib.md5() m2.update(out_data) self.assertEqual(m1.digest(), m2.digest()) + + def test_dont_need_dict_outmost(self): + data = 123456 + self.assertEqual(BEncoder(data).encode(), b'i123456e') diff --git a/tests/test_decode.py b/tests/test_decode.py new file mode 100644 index 0000000..4257277 --- /dev/null +++ b/tests/test_decode.py @@ -0,0 +1,11 @@ +from __future__ import unicode_literals + +import unittest + +from torrent_parser import decode + + +class TestDecode(unittest.TestCase): + + def test_decode(self): + self.assertEqual(decode(b'i12345e'), 12345) diff --git a/tests/test_encode.py b/tests/test_encode.py new file mode 100644 index 0000000..7914bac --- /dev/null +++ b/tests/test_encode.py @@ -0,0 +1,11 @@ +from __future__ import unicode_literals + +import unittest + +from torrent_parser import encode + + +class TestEncode(unittest.TestCase): + + def test_encode(self): + self.assertEqual(encode(12345), b'i12345e') diff --git a/tests/test_encoding_error.py b/tests/test_encoding_error.py new file mode 100644 index 0000000..a95a950 --- /dev/null +++ b/tests/test_encoding_error.py @@ -0,0 +1,25 @@ +from __future__ import unicode_literals + +import os.path +import unittest + +from torrent_parser import ( + TorrentFileParser, parse_torrent_file, InvalidTorrentDataException +) + + +class TestDecodingError(unittest.TestCase): + TEST_FILES_DIR = os.path.join(os.path.dirname(__file__), 
'test_files') + FILE = os.path.join(TEST_FILES_DIR, 'utf8.encoding.error.torrent') + + def test_default_option_will_raise_exception(self): + with self.assertRaises(InvalidTorrentDataException): + parse_torrent_file(self.FILE) + with self.assertRaises(InvalidTorrentDataException): + with open(self.FILE, 'rb') as f: + TorrentFileParser(f).parse() + + def test_not_raise_exception_when_use_ignore(self): + parse_torrent_file(self.FILE, errors='ignore') + with open(self.FILE, 'rb') as f: + TorrentFileParser(f, errors='ignore').parse() diff --git a/tests/test_files/outmost.string.torrent b/tests/test_files/outmost.string.torrent new file mode 100644 index 0000000..d15055f --- /dev/null +++ b/tests/test_files/outmost.string.torrent @@ -0,0 +1 @@ +8:announce \ No newline at end of file diff --git a/tests/test_files/utf8.encoding.error.torrent b/tests/test_files/utf8.encoding.error.torrent new file mode 100644 index 0000000..3f159d3 Binary files /dev/null and b/tests/test_files/utf8.encoding.error.torrent differ diff --git a/tests/test_parse.py b/tests/test_parse.py index c660b39..80962b5 100644 --- a/tests/test_parse.py +++ b/tests/test_parse.py @@ -11,6 +11,7 @@ class TestParse(unittest.TestCase): TEST_FILES_DIR = os.path.join(os.path.dirname(__file__), 'test_files') REAL_FILE = os.path.join(TEST_FILES_DIR, 'real.torrent') NEG_FILE = os.path.join(TEST_FILES_DIR, 'neg.torrent') + STRING_FILE = os.path.join(TEST_FILES_DIR, 'outmost.string.torrent') def test_parse_torrent_file_use_shortcut(self): parse_torrent_file(self.REAL_FILE) @@ -53,6 +54,10 @@ class TestParse(unittest.TestCase): data = parse_torrent_file(self.NEG_FILE) self.assertEqual(data['neg'], -1) + def test_dont_need_dict_outmost(self): + data = parse_torrent_file(self.STRING_FILE) + self.assertEqual(data, 'announce') + if __name__ == '__main__': unittest.main() diff --git a/torrent_parser.py b/torrent_parser.py old mode 100644 new mode 100755 index 97777f5..b3909da --- a/torrent_parser.py +++ b/torrent_parser.py 
@@ -62,11 +62,14 @@ except NameError: str_type = str __all__ = [ + 'InvalidTorrentDataException', + 'BEncoder', + 'BDecoder', + 'encode', + 'decode', + 'TorrentFileParser', 'create_torrent_file', 'parse_torrent_file', - 'InvalidTorrentDataException', - 'TorrentFileCreator', - 'TorrentFileParser', ] __version__ = '0.2.0' @@ -105,7 +108,7 @@ class TorrentFileParser(object): STRING_INDICATOR = b'' STRING_DELIMITER = b':' - RAW_FIELD_PARAMS = { + HASH_FIELD_PARAMS = { # field length need_list 'pieces': (20, True), 'ed2k': (16, False), @@ -120,14 +123,18 @@ class TorrentFileParser(object): (TYPE_STRING, STRING_INDICATOR), ] - def __init__(self, fp, use_ordered_dict=False, encoding='utf-8'): + def __init__( + self, fp, use_ordered_dict=False, encoding='utf-8', errors='strict' + ): """ :param fp: a **binary** file-like object to parse, which means need 'b' mode when use built-in open function - :param encoding: file content encoding, default utf-8, use 'auto' to - enable charset auto detection ('chardet' package should be installed) - :param use_ordered_dict: Use collections.OrderedDict as dict container - default False, which mean use built-in dict + :param bool use_ordered_dict: Use collections.OrderedDict as dict + container default False, which mean use built-in dict + :param string encoding: file content encoding, default utf-8, use 'auto' + to enable charset auto detection (need 'chardet' package installed) + :param string errors: how to deal with encoding error when try to parse + string from content with ``encoding`` """ if getattr(fp, 'read', ) is None \ or getattr(fp, 'seek') is None: @@ -137,12 +144,14 @@ class TorrentFileParser(object): self._encoding = encoding self._content = fp self._use_ordered_dict = use_ordered_dict + self._error_handler = errors def parse(self): """ :return: the parse result - :type: depends on ``use_ordered_dict`` option when init the parser - see :any:`TorrentFileParser.__init__` + :rtype: dict|list|int|string + :raise: 
:any:`InvalidTorrentDataException` when parse failed or error + happened when decode string using specified encoding """ self._restart() data = self._next_element() @@ -155,10 +164,7 @@ class TorrentFileParser(object): except EOFError: # expect EOF pass - if isinstance(data, dict): - return data - - raise InvalidTorrentDataException('Outermost element is not a dict') + return data def _read_byte(self, count=1, raise_eof=False): assert count >= 0 @@ -186,9 +192,8 @@ class TorrentFileParser(object): k = self._next_element() if k is _END: return - if k in self.RAW_FIELD_PARAMS: - length, need_list = self.RAW_FIELD_PARAMS[k] - v = self._next_hash(length, need_list) + if k in self.HASH_FIELD_PARAMS: + v = self._next_hash(*self.HASH_FIELD_PARAMS[k]) else: v = self._next_element() if k == 'encoding': @@ -225,25 +230,26 @@ class TorrentFileParser(object): char = self._read_byte(1) return -value if neg else value - def _next_string(self, decode=True): + def _next_string(self, need_decode=True): length = self._next_int(self.STRING_DELIMITER) raw = self._read_byte(length) - if decode: + if need_decode: encoding = self._encoding if encoding == 'auto': - encoding = detect(raw) + self.encoding = encoding = detect(raw) try: - string = raw.decode(encoding, "ignore") + string = raw.decode(encoding, self._error_handler) except UnicodeDecodeError as e: raise InvalidTorrentDataException( self._pos - length + e.start, - "Fail to decode string at pos {pos} using " + e.encoding + "Fail to decode string at pos {pos} using encoding " + + e.encoding ) return string return raw def _next_hash(self, p_len, need_list): - raw = self._next_string(decode=False) + raw = self._next_string(need_decode=False) if len(raw) % p_len != 0: raise InvalidTorrentDataException( self._pos - len(raw), "Hash bit length not match at pos {pos}" @@ -280,7 +286,7 @@ class TorrentFileParser(object): return element -class TorrentFileCreator(object): +class BEncoder(object): TYPES = { (dict,): 
TorrentFileParser.TYPE_DICT, @@ -291,27 +297,25 @@ class TorrentFileCreator(object): def __init__(self, data, encoding='utf-8'): """ - :param data: torrent data, must be a dict or OrderedDict - :param encoding: string field output encoding + :param dict|list|int|string data: data will be encoded + :param string encoding: string field output encoding """ - if not isinstance(data, dict): - raise InvalidTorrentDataException( - None, - "Top level structure should be a dict" - ) self._data = data self._encoding = encoding def encode(self): """ - Encode data to bytes that conform to torrent file format + Encode to bytes + + :rtype: bytes """ return b''.join(self._output_element(self._data)) - def encode_to_readable(self): + def encode_to_filelike(self): """ - Encode data to a file-like(BytesIO) object which contains the result of - `TorrentFileCreator.encode()` + Encode to a file-like(BytesIO) object + + :rtype: BytesIO """ return io.BytesIO(self.encode()) @@ -364,7 +368,7 @@ class TorrentFileCreator(object): ) for x in self._output_element(k): yield x - if k in TorrentFileParser.RAW_FIELD_PARAMS: + if k in TorrentFileParser.HASH_FIELD_PARAMS: for x in self._output_decode_hash(v): yield x else: @@ -393,29 +397,79 @@ class TorrentFileCreator(object): ) -def parse_torrent_file(filename, use_ordered_dict=False): +class BDecoder(object): + def __init__( + self, data, use_ordered_dict=False, encoding='utf-8', errors='strict' + ): + """ + :param bytes data: raw data to be decoded + :param bool use_ordered_dict: see :any:`TorrentFileParser.__init__` + :param string encoding: see :any:`TorrentFileParser.__init__` + :param string errors: see :any:`TorrentFileParser.__init__` + """ + self._data = bytes(data) + self._use_ordered_dict = use_ordered_dict + self._encoding = encoding + self._errors = errors + + def decode(self): + return TorrentFileParser( + io.BytesIO(self._data), self._use_ordered_dict, self._encoding, + self._errors, + ).parse() + + +def encode(data, 
encoding='utf-8'): + """ + Shortcut function for encode python object to torrent file format(bencode) + + :param dict|list|int|string data: data to be encoded + :param string encoding: see :any:`TorrentFileParser.__init__` + :rtype: bytes + """ + return BEncoder(data, encoding).encode() + + +def decode(data, use_ordered_dict=False, encoding='utf-8', errors='strict'): + """ + Shortcut function for decode bytes as torrent file format(bencode) to python + object + + :param bytes data: raw data to be decoded + :param bool use_ordered_dict: see :any:`TorrentFileParser.__init__` + :param string encoding: see :any:`TorrentFileParser.__init__` + :param string errors: see :any:`TorrentFileParser.__init__` + :rtype: dict|list|int|string + """ + return BDecoder(data, use_ordered_dict, encoding, errors).decode() + + +def parse_torrent_file( + filename, use_ordered_dict=False, encoding='utf-8', errors='strict', +): """ Shortcut function for parse torrent object using TorrentFileParser :param string filename: torrent filename :param bool use_ordered_dict: see :any:`TorrentFileParser.__init__` - :rtype: dict if ``use_ordered_dict`` is false, - collections.OrderedDict otherwise + :param string encoding: see :any:`TorrentFileParser.__init__` + :param string errors: see :any:`TorrentFileParser.__init__` + :rtype: dict|list|int|string """ with open(filename, 'rb') as f: - return TorrentFileParser(f, use_ordered_dict).parse() + return TorrentFileParser(f, use_ordered_dict, encoding, errors).parse() def create_torrent_file(filename, data, encoding='utf-8'): """ Shortcut function for create a torrent file using TorrentFileCreator - :param filename: output torrent filename - :param data: torrent data, must be a dict or OrderedDict - :param encoding: string field output encoding + :param string filename: output torrent filename + :param dict|list|int|string data: torrent data + :param string encoding: string field output encoding """ with open(filename, 'wb') as f: - 
f.write(TorrentFileCreator(data, encoding).encode()) + f.write(BEncoder(data, encoding).encode()) def __main(): @@ -432,7 +486,10 @@ def __main(): help='ensure output json use ascii char, ' 'escape other char use \\u') parser.add_argument('--coding', '-c', default='utf-8', - help='string encoding, default utf-8') + help='string encoding, default "utf-8"') + parser.add_argument('--errors', '-e', default='strict', + help='decoding error handler, default "strict", you can' + ' use "ignore" or "replace" to avoid exception') parser.add_argument('--version', '-v', action='store_true', default=False, help='print version and exit') args = parser.parse_args() @@ -453,7 +510,9 @@ def __main(): exit(1) # noinspection PyUnboundLocalVariable - data = TorrentFileParser(target_file, not args.dict, args.coding).parse() + data = TorrentFileParser( + target_file, not args.dict, args.coding, args.errors + ).parse() data = json.dumps( data, ensure_ascii=args.ascii,