From c2ce49250b4542191a04e33d3697fbd247eac59f Mon Sep 17 00:00:00 2001 From: 7sDream <7seconddream@gmail.com> Date: Fri, 6 Apr 2018 13:41:51 +0800 Subject: [PATCH] Finish enhancement by issue #3 - Fixed: Support negative int - Fixed: seek_back not move pos - Added: Auto encoding --- CHANGELOG.md | 68 +++++++++++++++++++ README.md | 7 +- tests/__init__.py | 1 + test.py => tests/test_all.py | 27 ++++++-- tests/testfiles/neg.torrent | 1 + test.torrent => tests/testfiles/real.torrent | Bin torrent_parser.py | 35 ++++++++-- 7 files changed, 125 insertions(+), 14 deletions(-) create mode 100644 CHANGELOG.md create mode 100644 tests/__init__.py rename test.py => tests/test_all.py (64%) create mode 100644 tests/testfiles/neg.torrent rename test.torrent => tests/testfiles/real.torrent (100%) diff --git a/CHANGELOG.md b/CHANGELOG.md new file mode 100644 index 0000000..86e4501 --- /dev/null +++ b/CHANGELOG.md @@ -0,0 +1,68 @@ +# Changelog + +All notable changes to this project will be documented in this file. + +The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/) +and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html). + +## [Unreleased] + +### Added + +- `encoding` option can be `auto`, which will use `chardet` package to decide which encoding to use. If `chardet` is not installed, will raise a warning and fall back to 'utf-8'. (Thanks to [@ltfychrise]) +- Add changelog. + +### Changed + +- Reorganize test codes/files. + +### Fixed + +- Fix integer field can't be negative bug. (Thanks to [@ltfychrise]) +- Fix `_seek_back` method not moving `_pos` back bug. (Thanks to [@ltfychrise]) + +## [0.1.3] - 2017-06-21 + +### Added + +- Now `UnicodeDecodeError` is wrapped in `InvalidTorrentDataException`. + +### Fixed + +- Use `IOError` instead of `FileNotFoundError` in Python 2. + +### Changed + +- `InvalidTorrentFileException` renamed to `InvalidTorrentDataException`. 
+ +## [0.1.2] - 2017-06-21 + +### Changed + +- Emm, I don't know, I just changed the version code... + +## [0.1.1] - 2017-06-20 + +### Added + +- CLI add coding `--coding/-c` option for file string field encoding. + +### Changed + +- `ed2k` and `filehash` fields now use the same structure as 'pieces'. + +## [0.1.0] - 2017-05-23 + +### Added + +- Parse torrent from file and data into a dict. +- CLI provided. +- Simple tests. +- Available on pip. + +[@ltfychrise]: https://github.com/ltfychrise +[Unreleased]: https://github.com/7sDream/torrent_parser/compare/v0.1.3...HEAD +[0.1.3]: https://github.com/7sDream/torrent_parser/compare/v0.1.2...v0.1.3 +[0.1.2]: https://github.com/7sDream/torrent_parser/compare/v0.1.1...v0.1.2 +[0.1.1]: https://github.com/7sDream/torrent_parser/compare/v0.1.0...v0.1.1 +[0.1.0]: https://github.com/7sDream/torrent_parser/tree/v0.1.0 \ No newline at end of file diff --git a/README.md b/README.md index dc8ddc5..c5f9938 100644 --- a/README.md +++ b/README.md @@ -37,9 +37,13 @@ http://tracker.trackerfix.com:80/announce ## Test ```bash -python -m unittest test +python -m unittest tests ``` +## Changelog + +See [Changelog][CHANGELOG]. + ## LICENSE See [License][LICENSE]. @@ -48,3 +52,4 @@ See [License][LICENSE]. 
[screenshots-normal]: http://rikka-10066868.image.myqcloud.com/1492616d-9f14-4fe2-9146-9a3ac06c6868.png [screenshots-indent]: http://rikka-10066868.image.myqcloud.com/eadc4184-6deb-42eb-bfd4-239da8f50c08.png [LICENSE]: https://github.com/7sDream/torrent_parser/blob/master/LICENSE +[CHANGELOG]: https://github.com/7sDream/torrent_parser/blob/master/CHANGELOG.md \ No newline at end of file diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 0000000..3bc8105 --- /dev/null +++ b/tests/__init__.py @@ -0,0 +1 @@ +from .test_all import * diff --git a/test.py b/tests/test_all.py similarity index 64% rename from test.py rename to tests/test_all.py index f86908f..e64525a 100644 --- a/test.py +++ b/tests/test_all.py @@ -7,31 +7,36 @@ from torrent_parser import TorrentFileParser, parse_torrent_file class Test(unittest.TestCase): - TEST_FILENAME = 'test.torrent' + REAL_FILE = 'tests/testfiles/real.torrent' + NEG_FILE = 'tests/testfiles/neg.torrent' def test_parse_torrent_file_use_shortcut(self): - parse_torrent_file(self.TEST_FILENAME) + parse_torrent_file(self.REAL_FILE) def test_parse_torrent_file_use_class(self): - with open(self.TEST_FILENAME, 'rb') as fp: + with open(self.REAL_FILE, 'rb') as fp: TorrentFileParser(fp).parse() + def test_encoding_auto(self): + with open(self.REAL_FILE, 'rb') as fp: + TorrentFileParser(fp, encoding='auto').parse() + def test_parse_torrent_file_to_ordered_dict(self): - data = parse_torrent_file(self.TEST_FILENAME, True) + data = parse_torrent_file(self.REAL_FILE, True) self.assertIsInstance(data, collections.OrderedDict) - with open(self.TEST_FILENAME, 'rb') as fp: + with open(self.REAL_FILE, 'rb') as fp: data = TorrentFileParser(fp, True).parse() self.assertIsInstance(data, collections.OrderedDict) def test_parse_correctness(self): - data = parse_torrent_file(self.TEST_FILENAME) + data = parse_torrent_file(self.REAL_FILE) self.assertIn(['udp://tracker.publicbt.com:80/announce'], data['announce-list']) 
self.assertEqual(data['creation date'], 1409254242) def test_parse_two_times(self): - with open(self.TEST_FILENAME, 'rb') as fp: + with open(self.REAL_FILE, 'rb') as fp: parser = TorrentFileParser(fp) data = parser.parse() self.assertIn(['udp://tracker.publicbt.com:80/announce'], @@ -41,3 +46,11 @@ class Test(unittest.TestCase): self.assertIn(['udp://tracker.publicbt.com:80/announce'], data['announce-list']) self.assertEqual(data['creation date'], 1409254242) + + def test_int_is_negative(self): + data = parse_torrent_file(self.NEG_FILE) + self.assertEqual(data['neg'], -1) + + +if __name__ == '__main__': + unittest.main() \ No newline at end of file diff --git a/tests/testfiles/neg.torrent b/tests/testfiles/neg.torrent new file mode 100644 index 0000000..3da5fb1 --- /dev/null +++ b/tests/testfiles/neg.torrent @@ -0,0 +1 @@ +d3:negi-1ee \ No newline at end of file diff --git a/test.torrent b/tests/testfiles/real.torrent similarity index 100% rename from test.torrent rename to tests/testfiles/real.torrent diff --git a/torrent_parser.py b/torrent_parser.py index 5ecd6ae..392f2ab 100644 --- a/torrent_parser.py +++ b/torrent_parser.py @@ -21,6 +21,7 @@ import collections import io import json import sys +import warnings try: FileNotFoundError @@ -29,6 +30,14 @@ except NameError: # noinspection PyShadowingBuiltins FileNotFoundError = IOError +try: + # noinspection PyPackageRequirements + from chardet import detect as _detect +except ImportError: + def _detect(_): + warnings.warn("No chardet module installed, encoding will be utf-8") + return {'encoding': 'utf-8', 'confidence': 1} + __all__ = [ 'InvalidTorrentDataException', 'parse_torrent_file', @@ -38,6 +47,10 @@ __all__ = [ __version__ = '0.1.3' +def detect(content): + return _detect(content)['encoding'] + + class InvalidTorrentDataException(Exception): def __init__(self, pos, msg=None): msg = msg or "Invalid torrent format when read at pos {pos}" @@ -48,6 +61,7 @@ class InvalidTorrentDataException(Exception): class 
__EndCls(object): pass + _END = __EndCls() @@ -77,7 +91,8 @@ class TorrentFileParser(object): """ :param fp: a **binary** file-like object to parse, which means need 'b' mode when use built-in open function - :param encoding: file content encoding, default utf-8 + :param encoding: file content encoding, default utf-8, use 'auto' to + enable charset auto detection ('chardet' package should be installed) :param use_ordered_dict: Use collections.OrderedDict as dict container default False, which mean use built-in dict """ @@ -127,6 +142,7 @@ class TorrentFileParser(object): def _seek_back(self, count): self._content.seek(-count, 1) + self._pos = self._pos - count def _restart(self): self._content.seek(0, 0) @@ -168,20 +184,26 @@ class TorrentFileParser(object): def _next_int(self, end=END_INDICATOR): value = 0 char = self._read_byte(1) + neg = False while char != end: - # noinspection PyTypeChecker - if not b'0' <= char <= b'9': + if not neg and char == b'-': + neg = True + elif not b'0' <= char <= b'9': raise InvalidTorrentDataException(self._pos - 1) - value = value * 10 + int(char) - int(b'0') + else: + value = value * 10 + int(char) - int(b'0') char = self._read_byte(1) - return value + return -value if neg else value def _next_string(self, decode=True): length = self._next_int(b':') raw = self._read_byte(length) if decode: + encoding = self._encoding + if encoding == 'auto': + encoding = detect(raw) try: - string = raw.decode(self._encoding) + string = raw.decode(encoding) except UnicodeDecodeError as e: raise InvalidTorrentDataException( self._pos - length + e.start, @@ -289,5 +311,6 @@ def __main(): print(data) + if __name__ == '__main__': __main()