diff --git a/src/cluecode/copyrights.py b/src/cluecode/copyrights.py index 6d17467acf..df690953b3 100644 --- a/src/cluecode/copyrights.py +++ b/src/cluecode/copyrights.py @@ -4282,35 +4282,36 @@ def is_candidate(prepared_line): return False if is_only_digit_and_punct(prepared_line): - if TRACE: - logger_debug(f'is_candidate: is_only_digit_and_punct:\n{prepared_line!r}') - return False if gibberish_detector.detect_gibberish(prepared_line): - if TRACE: - logger_debug(f'is_candidate: gibberish_detector.detect_gibberish:\n{prepared_line!r}') return False + lowered = prepared_line.lower() + + # ---------------------------------------------------------- + # Ignore (c) ONLY when it appears inside a URL path + # ---------------------------------------------------------- + if '(c)' in lowered: + # remove spaces to reconstruct possible broken URL + compact = lowered.replace(' ', '') + + # match http://.../(c)/... + if re.search(r'https?://[^ ]*\(c\)[^ ]*', compact): + return False + + # ---------------------------------------------------------- + # Original logic continues + # ---------------------------------------------------------- if copyrights_hint.years(prepared_line): return True - else: - pass - prepared_line = prepared_line.lower() + for marker in copyrights_hint.statement_markers: - if marker in prepared_line: + if marker in lowered: return True + return False -def is_inside_statement( - chars_only_line, - markers=('copyright', 'copyrights', 'copyrightby',) + copyrights_hint.all_years, -): - """ - Return True if a line ends with some strings that indicate we are still - inside a statement. - """ - return chars_only_line and chars_only_line.endswith(markers) def is_end_of_statement(chars_only_line): diff --git a/tests/cluecode/test_copyrights_basic.py b/tests/cluecode/test_copyrights_basic.py index 1fbafcf548..7f30031788 100644 --- a/tests/cluecode/test_copyrights_basic.py +++ b/tests/cluecode/test_copyrights_basic.py @@ -14,7 +14,26 @@ from commoncode.testcase import FileBasedTesting from cluecode import copyrights from cluecode.copyrights import prepare_text_line -from cluecode.copyrights import remove_non_chars +from cluecode import copyrights +from cluecode.copyrights import prepare_text_line, remove_non_chars + +def test_copyright_symbol_inside_url_is_ignored(): + text = "See http://example.com/(c)/path for more information." + + prepped = prepare_text_line(text) + + # sanity check + assert '(c)' in prepped + + # URLs containing (c) should NOT be copyright candidates + assert not copyrights.is_candidate(prepped) + +def test_copyright_with_url_is_still_candidate(): + text = "Copyright (c) 2000 Foo, http://example.com" + + prepped = prepare_text_line(text) + + assert copyrights.is_candidate(prepped) class TestTextPreparation(FileBasedTesting):