From 2c1cab4a2ee2d8725648a70dd7774c5d2de20b71 Mon Sep 17 00:00:00 2001 From: JK Date: Thu, 29 Jan 2026 12:39:48 +0900 Subject: [PATCH 1/4] =?UTF-8?q?feat(confluence-mdx):=20page.v1.yaml?= =?UTF-8?q?=EC=9D=84=20=ED=99=9C=EC=9A=A9=ED=95=9C=20=EC=A0=95=ED=99=95?= =?UTF-8?q?=ED=95=9C=20pageId=20=EA=B8=B0=EB=B0=98=20=EB=A7=81=ED=81=AC=20?= =?UTF-8?q?=EC=83=9D=EC=84=B1?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Description - page.v1.yaml의 body.view HTML을 파싱하여 링크 텍스트 → pageId 매핑을 생성하는 `build_link_mapping` 함수 추가 - `<ac:link>` 처리 로직에서 링크 매핑을 조회하여 정확한 pageId 기반 Confluence URL 생성 - 외부 Confluence 링크(변환 범위 외 페이지)에 대해 space overview URL 대신 특정 페이지 URL 생성 - pageId 있음: `https://querypie.atlassian.net/wiki/spaces/{space_key}/pages/{page_id}` - pageId 없음 + space_key 있음: `https://querypie.atlassian.net/wiki/spaces/{space_key}/overview` (기존 동작 유지) - 둘 다 없음: `#link-error` (기존 동작 유지) ### Background #576에서 구현한 외부 링크 처리는 space overview URL만 생성했습니다. page.xhtml에는 pageId가 포함되지 않지만, page.v1.yaml의 body.view HTML에는 `data-linked-resource-id` 속성으로 pageId가 포함되어 있어 이를 활용합니다. 
### Test Results 테스트 케이스 1844969501: - ✓ QueryPie Architecture → pages/400064797 - ✓ Advanced Environment Setup → pages/887947577 - ✓ Advanced Integration Guide → pages/841449834 - ✓ 릴리스 버전 별 문서 → pages/841351486 - ✓ Troubleshooting → pages/920486841 ## Related tickets & links - #577 Co-Authored-By: Claude Opus 4.5 --- .../bin/confluence_xhtml_to_markdown.py | 69 +++++++++++++++++-- .../tests/testcases/1844969501/expected.mdx | 10 +-- 2 files changed, 70 insertions(+), 9 deletions(-) diff --git a/confluence-mdx/bin/confluence_xhtml_to_markdown.py b/confluence-mdx/bin/confluence_xhtml_to_markdown.py index d078940b..93af9d28 100755 --- a/confluence-mdx/bin/confluence_xhtml_to_markdown.py +++ b/confluence-mdx/bin/confluence_xhtml_to_markdown.py @@ -143,6 +143,7 @@ def as_markdown(self, caption: Optional[str] = None, width: Optional[str] = None PAGES_BY_ID: PagesDict = {} GLOBAL_PAGE_V1: Optional[PageV1] = None GLOBAL_ATTACHMENTS: List[Attachment] = [] +GLOBAL_LINK_MAPPING: Dict[str, str] = {} # Mapping of link text -> pageId from page.v1.yaml # Hidden characters for text cleaning HIDDEN_CHARACTERS = { @@ -424,6 +425,53 @@ def load_page_v1_yaml(yaml_path: str) -> Optional[PageV1]: return None +def build_link_mapping(page_v1: Optional[PageV1]) -> Dict[str, str]: + """ + Build a mapping of link text -> pageId from page.v1.yaml body.view HTML + + This function parses the rendered HTML in page.v1.yaml's body.view section + to extract links with their pageIds. This allows us to generate accurate + Confluence URLs for external links (links to pages outside the current conversion scope). 
+ + Args: + page_v1 (Optional[PageV1]): The page.v1.yaml data structure + + Returns: + Dict[str, str]: Mapping of link text to pageId + """ + link_map = {} + + if not page_v1: + logging.warning("No page.v1 data available to build link mapping") + return link_map + + try: + view_html = page_v1.get('body', {}).get('view', {}).get('value', '') + + if not view_html: + logging.warning("No body.view HTML found in page.v1.yaml") + return link_map + + soup = BeautifulSoup(view_html, 'html.parser') + + # Find all links with data-linked-resource-id attribute + for link in soup.find_all('a', {'data-linked-resource-id': True}): + text = link.get_text() + page_id = link.get('data-linked-resource-id', '') + resource_type = link.get('data-linked-resource-type', '') + + if text and page_id and resource_type == 'page': + link_map[text] = page_id + logging.debug(f"Link mapping: '{text}' -> pageId {page_id}") + + logging.info(f"Built link mapping with {len(link_map)} entries") + + except Exception as e: + logging.error(f"Error building link mapping from page.v1.yaml: {e}") + + return link_map + + def backtick_curly_braces(text): """ Wrap text embraced by curly braces with backticks. 
@@ -888,11 +936,20 @@ def convert_recursively(self, node): # Internal link - use relative path href = relative_path_to_titled_page(target_title) else: - # External link - generate Confluence URL - if space_key: - # Use space overview URL since we don't have page_id + # External link - try to get pageId from link mapping + # We need to get the link_body early to look up pageId + link_body_node = node.find('ac:link-body') + current_link_body = SingleLineParser(link_body_node).as_markdown if link_body_node else link_body + page_id = GLOBAL_LINK_MAPPING.get(current_link_body) + + if page_id and space_key: + # Generate accurate URL with pageId + href = f'https://querypie.atlassian.net/wiki/spaces/{space_key}/pages/{page_id}' + logging.info(f"Generated external Confluence link with pageId for '{current_link_body}' (title: '{target_title}'): {href}") + elif space_key: + # Fallback to space overview URL if no pageId found href = f'https://querypie.atlassian.net/wiki/spaces/{space_key}/overview' - logging.info(f"Generated external Confluence space link for title '{target_title}' in space '{space_key}': {href}") + logging.warning(f"No pageId found for '{current_link_body}', using space overview for '{target_title}' in space '{space_key}': {href}") else: # No space key - show simple error message href = '#link-error' @@ -2081,6 +2138,10 @@ def main(): page_v1: Optional[PageV1] = load_page_v1_yaml(os.path.join(input_dir, 'page.v1.yaml')) set_page_v1(page_v1) + # Build link mapping from page.v1.yaml for external link pageId resolution + global GLOBAL_LINK_MAPPING + GLOBAL_LINK_MAPPING = build_link_mapping(page_v1) + converter = ConfluenceToMarkdown(html_content) converter.load_attachments(input_dir, output_dir, args.public_dir) markdown_content = converter.as_markdown() diff --git a/confluence-mdx/tests/testcases/1844969501/expected.mdx b/confluence-mdx/tests/testcases/1844969501/expected.mdx index a303eec6..20a3a3ad 100644 --- 
a/confluence-mdx/tests/testcases/1844969501/expected.mdx +++ b/confluence-mdx/tests/testcases/1844969501/expected.mdx @@ -19,11 +19,11 @@ title: '지원' 기술지원을 위한 [Confluence Space](https://querypie.atlassian.net/wiki/spaces/QCP/overview) 에서 상세한 기술지원 자료를 확인할 수 있습니다. -* [QueryPie Architecture](https://querypie.atlassian.net/wiki/spaces/QCP/overview) -* [Advanced Environment Setup](https://querypie.atlassian.net/wiki/spaces/QCP/overview) -* [Advanced Integration Guide](https://querypie.atlassian.net/wiki/spaces/QCP/overview) -* [릴리스 버전 별 문서](https://querypie.atlassian.net/wiki/spaces/QCP/overview) -* [Troubleshooting](https://querypie.atlassian.net/wiki/spaces/QCP/overview) +* [QueryPie Architecture](https://querypie.atlassian.net/wiki/spaces/QCP/pages/400064797) +* [Advanced Environment Setup](https://querypie.atlassian.net/wiki/spaces/QCP/pages/887947577) +* [Advanced Integration Guide](https://querypie.atlassian.net/wiki/spaces/QCP/pages/841449834) +* [릴리스 버전 별 문서](https://querypie.atlassian.net/wiki/spaces/QCP/pages/841351486) +* [Troubleshooting](https://querypie.atlassian.net/wiki/spaces/QCP/pages/920486841) #### Product Demo - YouTube From ea101dab07aab7f2fa5b067bffdf167e9e17b778 Mon Sep 17 00:00:00 2001 From: JK Date: Thu, 29 Jan 2026 12:47:51 +0900 Subject: [PATCH 2/4] =?UTF-8?q?refactor(confluence-mdx):=20=EC=99=B8?= =?UTF-8?q?=EB=B6=80=20=EB=A7=81=ED=81=AC=20=EC=B2=98=EB=A6=AC=20=EB=A1=9C?= =?UTF-8?q?=EC=A7=81=EC=9D=84=20resolve=5Fexternal=5Flink=20=ED=95=A8?= =?UTF-8?q?=EC=88=98=EB=A1=9C=20=EB=B6=84=EB=A6=AC?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Description - 외부 링크 URL 생성 로직을 `resolve_external_link` 함수로 추출하여 코드 간결성 향상 - `convert_recursively` 메서드의 복잡도 감소 (18줄 → 3줄) - 링크 생성 로직의 재사용성 및 테스트 용이성 개선 ### Changes - `resolve_external_link(link_text, space_key, target_title)` 함수 추가 - pageId 조회 및 URL 생성 로직 캡슐화 - 명확한 fallback 전략 (pageId → space overview → error link) - `convert_recursively`에서 외부 링크 처리 부분 간소화 ### 
Benefits - 코드 가독성 향상 - 단일 책임 원칙 적용 - 유지보수 용이성 증가 Co-Authored-By: Claude Opus 4.5 --- .../bin/confluence_xhtml_to_markdown.py | 58 ++++++++++++++----- 1 file changed, 42 insertions(+), 16 deletions(-) diff --git a/confluence-mdx/bin/confluence_xhtml_to_markdown.py b/confluence-mdx/bin/confluence_xhtml_to_markdown.py index 93af9d28..261de86e 100755 --- a/confluence-mdx/bin/confluence_xhtml_to_markdown.py +++ b/confluence-mdx/bin/confluence_xhtml_to_markdown.py @@ -472,6 +472,45 @@ def build_link_mapping(page_v1: Optional[PageV1]) -> Dict[str, str]: return link_map +def resolve_external_link(link_text: str, space_key: str, target_title: str) -> str: + """ + Resolve external Confluence link URL using pageId from global link mapping + + This function attempts to generate an accurate Confluence URL for external links + (links to pages outside the current conversion scope) by looking up the pageId + from GLOBAL_LINK_MAPPING. If pageId is not found, it falls back to space overview + or error link. 
+ + Args: + link_text (str): The link body text to match in GLOBAL_LINK_MAPPING + space_key (str): The Confluence space key + target_title (str): The target page title (for logging purposes) + + Returns: + str: The resolved URL in one of these formats: + - With pageId: https://querypie.atlassian.net/wiki/spaces/{space_key}/pages/{page_id} + - Without pageId but with space_key: https://querypie.atlassian.net/wiki/spaces/{space_key}/overview + - Without space_key: #link-error + """ + page_id = GLOBAL_LINK_MAPPING.get(link_text) + + if page_id and space_key: + # Generate accurate URL with pageId + href = f'https://querypie.atlassian.net/wiki/spaces/{space_key}/pages/{page_id}' + logging.info(f"Generated external Confluence link with pageId for '{link_text}' (title: '{target_title}'): {href}") + return href + elif space_key: + # Fallback to space overview URL if no pageId found + href = f'https://querypie.atlassian.net/wiki/spaces/{space_key}/overview' + logging.warning(f"No pageId found for '{link_text}', using space overview for '{target_title}' in space '{space_key}': {href}") + return href + else: + # No space key - show simple error message + href = '#link-error' + logging.warning(f"No space key found for external link to '{target_title}', using error anchor: {href}") + return href + + def backtick_curly_braces(text): """ Wrap text embraced by curly braces with backticks. 
@@ -936,24 +975,11 @@ def convert_recursively(self, node): # Internal link - use relative path href = relative_path_to_titled_page(target_title) else: - # External link - try to get pageId from link mapping - # We need to get the link_body early to look up pageId + # External link - resolve using pageId from link mapping + # Get link_body explicitly to ensure we have the correct text for lookup link_body_node = node.find('ac:link-body') current_link_body = SingleLineParser(link_body_node).as_markdown if link_body_node else link_body - page_id = GLOBAL_LINK_MAPPING.get(current_link_body) - - if page_id and space_key: - # Generate accurate URL with pageId - href = f'https://querypie.atlassian.net/wiki/spaces/{space_key}/pages/{page_id}' - logging.info(f"Generated external Confluence link with pageId for '{current_link_body}' (title: '{target_title}'): {href}") - elif space_key: - # Fallback to space overview URL if no pageId found - href = f'https://querypie.atlassian.net/wiki/spaces/{space_key}/overview' - logging.warning(f"No pageId found for '{current_link_body}', using space overview for '{target_title}' in space '{space_key}': {href}") - else: - # No space key - show simple error message - href = '#link-error' - logging.warning(f"No space key found for external link to '{target_title}', using error anchor: {href}") + href = resolve_external_link(current_link_body, space_key, target_title) self.markdown_lines.append(f'[{link_body}{decoded_anchor}]({href}{lowercased_fragment})') elif node.name in ['ri:page']: From 57bd9e2ba1c899780f642aab15e09207f36f7db6 Mon Sep 17 00:00:00 2001 From: JK Date: Thu, 29 Jan 2026 12:50:16 +0900 Subject: [PATCH 3/4] =?UTF-8?q?refactor(confluence-mdx):=20ac:link=20?= =?UTF-8?q?=EB=B3=80=ED=99=98=20=EB=A1=9C=EC=A7=81=EC=9D=84=20convert=5Fac?= =?UTF-8?q?=5Flink=20=EB=A9=94=EC=84=9C=EB=93=9C=EB=A1=9C=20=EB=B6=84?= =?UTF-8?q?=EB=A6=AC?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Description 
- SingleLineParser 클래스에 `convert_ac_link` 메서드 추가 - `convert_recursively` 메서드의 ac:link 처리 블록을 독립적인 메서드로 추출 - 코드 복잡도 감소 및 가독성 향상 (54줄 → 3줄) ### Changes - `convert_ac_link(node: Tag) -> str` 메서드 추가 - ac:link 노드를 markdown 링크 형식으로 변환 - 내부 링크, 외부 링크, space 링크 처리 로직 캡슐화 - anchor fragment 처리 포함 - `convert_recursively`에서 ac:link 처리 부분 간소화 ### Benefits - **관심사 분리**: ac:link 처리 로직이 독립적인 메서드로 관리됨 - **코드 가독성**: convert_recursively 메서드가 더욱 간결하고 이해하기 쉬워짐 - **유지보수성**: ac:link 관련 수정 시 한 곳만 변경 - **테스트 용이성**: 메서드 단위 테스트 작성 가능 ### Test Results - ✅ 테스트 케이스 1844969501 검증 완료 (출력 동일) - ✅ pytest 전체 테스트 통과 (55 passed) Co-Authored-By: Claude Opus 4.5 --- .../bin/confluence_xhtml_to_markdown.py | 122 ++++++++++-------- 1 file changed, 68 insertions(+), 54 deletions(-) diff --git a/confluence-mdx/bin/confluence_xhtml_to_markdown.py b/confluence-mdx/bin/confluence_xhtml_to_markdown.py index 261de86e..7fbe56cc 100755 --- a/confluence-mdx/bin/confluence_xhtml_to_markdown.py +++ b/confluence-mdx/bin/confluence_xhtml_to_markdown.py @@ -928,60 +928,9 @@ def convert_recursively(self, node): link_text = readable_anchor_text self.markdown_lines.append(f"[{link_text}]({href})") elif node.name in ['ac:link']: - """ - - - Slack DM 개인 알림 사용하기 - - - - My Dashboard - - """ - link_body = '(ERROR: Link body not found)' - anchor = node.get('anchor', '') - if anchor: - decoded_anchor = ' | ' + unquote(anchor) - lowercased_fragment = '#' + anchor.lower() - else: - decoded_anchor = '' - lowercased_fragment = '' - - href = '#' - ri_page = None - ri_space = None - for child in node.children: - if isinstance(child, Tag) and child.name == 'ac:link-body': - link_body = SingleLineParser(child).as_markdown - if isinstance(child, Tag) and child.name == 'ri:space': - # Handle space links: - ri_space = child - space_key = child.get('space-key', '') - if space_key: - href = f'https://querypie.atlassian.net/wiki/spaces/{space_key}/overview' - logging.info(f"Generated Confluence space overview link for space '{space_key}': 
{href}") - else: - href = '#link-error' - logging.warning(f"No space key found in ri:space tag, using error anchor: {href}") - if isinstance(child, Tag) and child.name == 'ri:page': - ri_page = child - target_title = child.get('content-title', '') - space_key = child.get('space-key', '') - - # Check if the target page is in pages.yaml - target_page = PAGES_BY_TITLE.get(target_title) - - if target_page: - # Internal link - use relative path - href = relative_path_to_titled_page(target_title) - else: - # External link - resolve using pageId from link mapping - # Get link_body explicitly to ensure we have the correct text for lookup - link_body_node = node.find('ac:link-body') - current_link_body = SingleLineParser(link_body_node).as_markdown if link_body_node else link_body - href = resolve_external_link(current_link_body, space_key, target_title) - - self.markdown_lines.append(f'[{link_body}{decoded_anchor}]({href}{lowercased_fragment})') + # Convert ac:link node to markdown link + markdown_link = self.convert_ac_link(node) + self.markdown_lines.append(markdown_link) elif node.name in ['ri:page']: content_title = node.get('content-title', '#') self.markdown_lines.append(content_title) @@ -1108,6 +1057,71 @@ def markdown_of_children(self, node): markdown.append(SingleLineParser(child).as_markdown) return ''.join(markdown) + def convert_ac_link(self, node: Tag) -> str: + """ + Convert ac:link node to markdown link format + + Handles various types of Confluence links: + - Internal page links (ri:page with content in pages.yaml) + - External page links (ri:page outside conversion scope) + - Space links (ri:space) + + Example XHTML: + + + Link Text + + + Returns: + str: Markdown link in format [link_body](href) + """ + link_body = '(ERROR: Link body not found)' + anchor = node.get('anchor', '') + + # Process anchor fragment + if anchor: + decoded_anchor = ' | ' + unquote(anchor) + lowercased_fragment = '#' + anchor.lower() + else: + decoded_anchor = '' + 
lowercased_fragment = '' + + href = '#' + + # Process child nodes to extract link body and determine href + for child in node.children: + if isinstance(child, Tag) and child.name == 'ac:link-body': + link_body = SingleLineParser(child).as_markdown + + elif isinstance(child, Tag) and child.name == 'ri:space': + # Handle space links: + space_key = child.get('space-key', '') + if space_key: + href = f'https://querypie.atlassian.net/wiki/spaces/{space_key}/overview' + logging.info(f"Generated Confluence space overview link for space '{space_key}': {href}") + else: + href = '#link-error' + logging.warning(f"No space key found in ri:space tag, using error anchor: {href}") + + elif isinstance(child, Tag) and child.name == 'ri:page': + target_title = child.get('content-title', '') + space_key = child.get('space-key', '') + + # Check if the target page is in pages.yaml + target_page = PAGES_BY_TITLE.get(target_title) + + if target_page: + # Internal link - use relative path + href = relative_path_to_titled_page(target_title) + else: + # External link - resolve using pageId from link mapping + # Get link_body explicitly to ensure we have the correct text for lookup + link_body_node = node.find('ac:link-body') + current_link_body = SingleLineParser(link_body_node).as_markdown if link_body_node else link_body + href = resolve_external_link(current_link_body, space_key, target_title) + + return f'[{link_body}{decoded_anchor}]({href}{lowercased_fragment})' + def convert_inline_image(self, node): """ Process Confluence-specific image tags and convert them to Markdown format. 
From 31af07a23a3acdfcee50d1fee86d99c53af90c16 Mon Sep 17 00:00:00 2001 From: JK Date: Thu, 29 Jan 2026 13:03:36 +0900 Subject: [PATCH 4/4] =?UTF-8?q?docs(confluence-mdx):=20convert=5Fac=5Flink?= =?UTF-8?q?=20=EB=A9=94=EC=84=9C=EB=93=9C=EC=97=90=20=EB=A7=81=ED=81=AC=20?= =?UTF-8?q?=EC=9C=A0=ED=98=95=EB=B3=84=20=EC=98=88=EC=8B=9C=20=EC=B6=94?= =?UTF-8?q?=EA=B0=80?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ## Description - `convert_ac_link` 메서드의 docstring에 6가지 링크 유형별 상세 예시 추가 - 각 유형별로 입력 XHTML과 출력 Markdown을 명확하게 문서화 ### 추가된 예시 1. **Internal Page Link**: pages.yaml에 있는 페이지 → 상대 경로 2. **External Page Link with pageId**: page.v1.yaml에서 pageId 조회 성공 → 정확한 페이지 URL 3. **External Page Link without pageId**: pageId 조회 실패 → space overview URL 4. **Space Link**: ri:space 태그 → space overview URL 5. **Link with Anchor Fragment**: anchor 속성 포함 → fragment 추가 6. **Error Case**: space key 없음 → #link-error ### Benefits - 코드 사용자가 각 링크 유형의 변환 결과를 명확하게 이해 가능 - 유지보수 시 예상 동작을 쉽게 파악 - 새로운 개발자의 온보딩 용이성 향상 Co-Authored-By: Claude Opus 4.5 --- .../bin/confluence_xhtml_to_markdown.py | 70 ++++++++++++++++--- 1 file changed, 59 insertions(+), 11 deletions(-) diff --git a/confluence-mdx/bin/confluence_xhtml_to_markdown.py b/confluence-mdx/bin/confluence_xhtml_to_markdown.py index 7fbe56cc..8f93d687 100755 --- a/confluence-mdx/bin/confluence_xhtml_to_markdown.py +++ b/confluence-mdx/bin/confluence_xhtml_to_markdown.py @@ -1061,19 +1061,67 @@ def convert_ac_link(self, node: Tag) -> str: """ Convert ac:link node to markdown link format - Handles various types of Confluence links: - - Internal page links (ri:page with content in pages.yaml) - - External page links (ri:page outside conversion scope) - - Space links (ri:space) - - Example XHTML: - - - Link Text - + Handles various types of Confluence links and generates appropriate markdown output: + + 1. 
Internal Page Link (target page in pages.yaml): + XHTML: + + + User Guide + + Output: + [User Guide](../../user-guide) + + 2. External Page Link with pageId (target page in page.v1.yaml link mapping): + XHTML: + + + QueryPie Architecture + + Output: + [QueryPie Architecture](https://querypie.atlassian.net/wiki/spaces/QCP/pages/400064797) + + 3. External Page Link without pageId (fallback to space overview): + XHTML: + + + Unknown Page + + Output: + [Unknown Page](https://querypie.atlassian.net/wiki/spaces/QCP/overview) + + 4. Space Link: + XHTML: + + + Confluence Space + + Output: + [Confluence Space](https://querypie.atlassian.net/wiki/spaces/QCP/overview) + + 5. Link with Anchor Fragment: + XHTML: + + + My Dashboard + + Output: + [My Dashboard | section-name](../../my-dashboard#section-name) + + 6. Error Case (no space key): + XHTML: + + + Missing Page + + Output: + [Missing Page](#link-error) + + Args: + node (Tag): BeautifulSoup Tag object representing ac:link node Returns: - str: Markdown link in format [link_body](href) + str: Markdown link in format [link_body](href) or [link_body | anchor](href#fragment) """ link_body = '(ERROR: Link body not found)' anchor = node.get('anchor', '')