diff --git a/confluence-mdx/bin/confluence_xhtml_to_markdown.py b/confluence-mdx/bin/confluence_xhtml_to_markdown.py
index d078940b..8f93d687 100755
--- a/confluence-mdx/bin/confluence_xhtml_to_markdown.py
+++ b/confluence-mdx/bin/confluence_xhtml_to_markdown.py
@@ -143,6 +143,7 @@ def as_markdown(self, caption: Optional[str] = None, width: Optional[str] = None
PAGES_BY_ID: PagesDict = {}
GLOBAL_PAGE_V1: Optional[PageV1] = None
GLOBAL_ATTACHMENTS: List[Attachment] = []
+GLOBAL_LINK_MAPPING: Dict[str, str] = {} # Link text -> pageId; assigned in main() from build_link_mapping(page_v1), read by resolve_external_link()
# Hidden characters for text cleaning
HIDDEN_CHARACTERS = {
@@ -424,6 +425,92 @@ def load_page_v1_yaml(yaml_path: str) -> Optional[PageV1]:
return None
+def build_link_mapping(page_v1: Optional[PageV1]) -> Dict[str, str]:
+ """
+ Build a mapping of link text -> pageId from page.v1.yaml body.view HTML.
+
+ Reads page_v1['body']['view']['value'], parses it with BeautifulSoup's
+ 'html.parser', and records one entry for every <a> element that carries a
+ data-linked-resource-id attribute and whose data-linked-resource-type is
+ 'page'. resolve_external_link() looks up this mapping to build Confluence
+ URLs that point at a specific page rather than at a space overview.
+
+ Notes:
+ - Keys are the anchor's get_text() value, used unmodified (no stripping).
+ - When several anchors share the same text, the last one parsed wins.
+ - Any exception raised while reading or parsing is logged and swallowed;
+ entries collected before the failure are still returned.
+
+ Args:
+ page_v1 (Optional[PageV1]): The page.v1.yaml data structure
+
+ Returns:
+ Dict[str, str]: Mapping of link text to pageId. Empty when page_v1 is
+ falsy or when no body.view HTML is found.
+ """
+ link_map = {}
+
+ if not page_v1:
+ logging.warning("No page.v1 data available to build link mapping")
+ return link_map
+
+ try:
+ # Every .get() supplies a default, so a missing level yields '' here instead of raising KeyError
+ view_html = page_v1.get('body', {}).get('view', {}).get('value', '')
+
+ if not view_html:
+ logging.warning("No body.view HTML found in page.v1.yaml")
+ return link_map
+
+ soup = BeautifulSoup(view_html, 'html.parser')
+
+ # Find all links with data-linked-resource-id attribute
+ for link in soup.find_all('a', {'data-linked-resource-id': True}):
+ text = link.get_text()
+ page_id = link.get('data-linked-resource-id', '')
+ resource_type = link.get('data-linked-resource-type', '')
+
+ # Keep only page resources that have both a non-empty text and a non-empty id
+ if text and page_id and resource_type == 'page':
+ link_map[text] = page_id
+ logging.debug(f"Link mapping: '{text}' -> pageId {page_id}")
+
+ logging.info(f"Built link mapping with {len(link_map)} entries")
+
+ except Exception as e:
+ # Best effort: log the failure and fall through to return whatever was collected so far
+ logging.error(f"Error building link mapping from page.v1.yaml: {e}")
+
+ return link_map
+
+
+def resolve_external_link(link_text: str, space_key: str, target_title: str) -> str:
+ """
+ Resolve external Confluence link URL using pageId from global link mapping.
+
+ Looks up link_text in the module-level GLOBAL_LINK_MAPPING and picks one of
+ three results, in this order:
+
+ 1. pageId found and space_key non-empty -> page URL built from both.
+ 2. space_key non-empty (no pageId found) -> space overview URL.
+ 3. space_key empty -> '#link-error'. This branch is also taken when a
+ pageId WAS found but space_key is empty, because the page URL format
+ used here needs the space key.
+
+ The chosen URL is logged (info for case 1, warning for cases 2 and 3).
+
+ Args:
+ link_text (str): The link body text to match in GLOBAL_LINK_MAPPING
+ space_key (str): The Confluence space key
+ target_title (str): The target page title (used only in log messages)
+
+ Returns:
+ str: The resolved URL in one of these formats:
+ - With pageId: https://querypie.atlassian.net/wiki/spaces/{space_key}/pages/{page_id}
+ - Without pageId but with space_key: https://querypie.atlassian.net/wiki/spaces/{space_key}/overview
+ - Without space_key: #link-error
+ """
+ # .get() returns None when link_text is not a key, which is falsy in the checks below
+ page_id = GLOBAL_LINK_MAPPING.get(link_text)
+
+ if page_id and space_key:
+ # Generate accurate URL with pageId
+ href = f'https://querypie.atlassian.net/wiki/spaces/{space_key}/pages/{page_id}'
+ logging.info(f"Generated external Confluence link with pageId for '{link_text}' (title: '{target_title}'): {href}")
+ return href
+ elif space_key:
+ # Fallback to space overview URL if no pageId found
+ href = f'https://querypie.atlassian.net/wiki/spaces/{space_key}/overview'
+ logging.warning(f"No pageId found for '{link_text}', using space overview for '{target_title}' in space '{space_key}': {href}")
+ return href
+ else:
+ # No space key (regardless of whether a pageId was found) - use the error anchor
+ href = '#link-error'
+ logging.warning(f"No space key found for external link to '{target_title}', using error anchor: {href}")
+ return href
+
+
def backtick_curly_braces(text):
"""
Wrap text embraced by curly braces with backticks.
@@ -841,64 +928,9 @@ def convert_recursively(self, node):
link_text = readable_anchor_text
self.markdown_lines.append(f"[{link_text}]({href})")
elif node.name in ['ac:link']:
- """
-
-
- Slack DM 개인 알림 사용하기
-
-
-
- My Dashboard
-
- """
- link_body = '(ERROR: Link body not found)'
- anchor = node.get('anchor', '')
- if anchor:
- decoded_anchor = ' | ' + unquote(anchor)
- lowercased_fragment = '#' + anchor.lower()
- else:
- decoded_anchor = ''
- lowercased_fragment = ''
-
- href = '#'
- ri_page = None
- ri_space = None
- for child in node.children:
- if isinstance(child, Tag) and child.name == 'ac:link-body':
- link_body = SingleLineParser(child).as_markdown
- if isinstance(child, Tag) and child.name == 'ri:space':
- # Handle space links:
- ri_space = child
- space_key = child.get('space-key', '')
- if space_key:
- href = f'https://querypie.atlassian.net/wiki/spaces/{space_key}/overview'
- logging.info(f"Generated Confluence space overview link for space '{space_key}': {href}")
- else:
- href = '#link-error'
- logging.warning(f"No space key found in ri:space tag, using error anchor: {href}")
- if isinstance(child, Tag) and child.name == 'ri:page':
- ri_page = child
- target_title = child.get('content-title', '')
- space_key = child.get('space-key', '')
-
- # Check if the target page is in pages.yaml
- target_page = PAGES_BY_TITLE.get(target_title)
-
- if target_page:
- # Internal link - use relative path
- href = relative_path_to_titled_page(target_title)
- else:
- # External link - generate Confluence URL
- if space_key:
- # Use space overview URL since we don't have page_id
- href = f'https://querypie.atlassian.net/wiki/spaces/{space_key}/overview'
- logging.info(f"Generated external Confluence space link for title '{target_title}' in space '{space_key}': {href}")
- else:
- # No space key - show simple error message
- href = '#link-error'
- logging.warning(f"No space key found for external link to '{target_title}', using error anchor: {href}")
-
- self.markdown_lines.append(f'[{link_body}{decoded_anchor}]({href}{lowercased_fragment})')
+ # Convert ac:link node to markdown link
+ markdown_link = self.convert_ac_link(node)
+ self.markdown_lines.append(markdown_link)
elif node.name in ['ri:page']:
content_title = node.get('content-title', '#')
self.markdown_lines.append(content_title)
@@ -1025,6 +1057,119 @@ def markdown_of_children(self, node):
markdown.append(SingleLineParser(child).as_markdown)
return ''.join(markdown)
+ def convert_ac_link(self, node: Tag) -> str:
+ """
+ Convert ac:link node to markdown link format
+
+ Handles various types of Confluence links and generates appropriate markdown output.
+
+ NOTE(review): the XHTML snippets below are illustrative. Attribute names are
+ written the way this method reads them (node.get('anchor'),
+ child.get('content-title'), child.get('space-key')); the raw storage format
+ may spell them with a namespace prefix - TODO confirm against real input.
+
+ 1. Internal Page Link (target page in pages.yaml):
+ XHTML:
+ <ac:link>
+ <ri:page content-title="User Guide"/>
+ <ac:link-body>User Guide</ac:link-body>
+ </ac:link>
+ Output:
+ [User Guide](../../user-guide)
+
+ 2. External Page Link with pageId (target page in page.v1.yaml link mapping):
+ XHTML:
+ <ac:link>
+ <ri:page space-key="QCP" content-title="QueryPie Architecture"/>
+ <ac:link-body>QueryPie Architecture</ac:link-body>
+ </ac:link>
+ Output:
+ [QueryPie Architecture](https://querypie.atlassian.net/wiki/spaces/QCP/pages/400064797)
+
+ 3. External Page Link without pageId (fallback to space overview):
+ XHTML:
+ <ac:link>
+ <ri:page space-key="QCP" content-title="Unknown Page"/>
+ <ac:link-body>Unknown Page</ac:link-body>
+ </ac:link>
+ Output:
+ [Unknown Page](https://querypie.atlassian.net/wiki/spaces/QCP/overview)
+
+ 4. Space Link:
+ XHTML:
+ <ac:link>
+ <ri:space space-key="QCP"/>
+ <ac:link-body>Confluence Space</ac:link-body>
+ </ac:link>
+ Output:
+ [Confluence Space](https://querypie.atlassian.net/wiki/spaces/QCP/overview)
+
+ 5. Link with Anchor Fragment:
+ XHTML:
+ <ac:link anchor="section-name">
+ <ri:page content-title="My Dashboard"/>
+ <ac:link-body>My Dashboard</ac:link-body>
+ </ac:link>
+ Output:
+ [My Dashboard | section-name](../../my-dashboard#section-name)
+
+ 6. Error Case (no space key):
+ XHTML:
+ <ac:link>
+ <ri:page content-title="Missing Page"/>
+ <ac:link-body>Missing Page</ac:link-body>
+ </ac:link>
+ Output:
+ [Missing Page](#link-error)
+
+ Resolution details:
+ - href starts as '#' and is overwritten by every ri:space / ri:page child in
+ iteration order, so the last such child decides the final href.
+ - When no ac:link-body child is present, the link text stays
+ '(ERROR: Link body not found)'.
+ - With an anchor, the visible text gets ' | ' + unquote(anchor) appended and
+ the href gets '#' + anchor.lower() appended (the raw anchor lowercased, not
+ the decoded one). The fragment is appended to whatever href was chosen,
+ including external URLs and '#link-error'.
+ - NOTE(review): the external lookup key is the markdown-rendered link body
+ (SingleLineParser(...).as_markdown), while build_link_mapping() stores plain
+ get_text() keys; a link body containing inline formatting may therefore
+ miss the mapping - confirm.
+
+ Args:
+ node (Tag): BeautifulSoup Tag object representing ac:link node
+
+ Returns:
+ str: Markdown link in format [link_body](href) or [link_body | anchor](href#fragment)
+ """
+ link_body = '(ERROR: Link body not found)'
+ anchor = node.get('anchor', '')
+
+ # Process anchor fragment
+ if anchor:
+ decoded_anchor = ' | ' + unquote(anchor)
+ lowercased_fragment = '#' + anchor.lower()
+ else:
+ decoded_anchor = ''
+ lowercased_fragment = ''
+
+ # Default href when the node has no ri:space / ri:page child
+ href = '#'
+
+ # Process child nodes to extract link body and determine href
+ for child in node.children:
+ if isinstance(child, Tag) and child.name == 'ac:link-body':
+ link_body = SingleLineParser(child).as_markdown
+
+ elif isinstance(child, Tag) and child.name == 'ri:space':
+ # Handle space links:
+ space_key = child.get('space-key', '')
+ if space_key:
+ href = f'https://querypie.atlassian.net/wiki/spaces/{space_key}/overview'
+ logging.info(f"Generated Confluence space overview link for space '{space_key}': {href}")
+ else:
+ href = '#link-error'
+ logging.warning(f"No space key found in ri:space tag, using error anchor: {href}")
+
+ elif isinstance(child, Tag) and child.name == 'ri:page':
+ target_title = child.get('content-title', '')
+ space_key = child.get('space-key', '')
+
+ # Check if the target page is in pages.yaml
+ target_page = PAGES_BY_TITLE.get(target_title)
+
+ if target_page:
+ # Internal link - use relative path
+ href = relative_path_to_titled_page(target_title)
+ else:
+ # External link - resolve using pageId from link mapping
+ # If ri:page is visited before ac:link-body, link_body still holds the placeholder here,
+ # so fetch the link body directly from the node; fall back to link_body when none exists
+ link_body_node = node.find('ac:link-body')
+ current_link_body = SingleLineParser(link_body_node).as_markdown if link_body_node else link_body
+ href = resolve_external_link(current_link_body, space_key, target_title)
+
+ return f'[{link_body}{decoded_anchor}]({href}{lowercased_fragment})'
+
def convert_inline_image(self, node):
"""
Process Confluence-specific image tags and convert them to Markdown format.
@@ -2081,6 +2226,10 @@ def main():
page_v1: Optional[PageV1] = load_page_v1_yaml(os.path.join(input_dir, 'page.v1.yaml'))
set_page_v1(page_v1)
+ # Build link mapping from page.v1.yaml for external link pageId resolution
+ global GLOBAL_LINK_MAPPING
+ GLOBAL_LINK_MAPPING = build_link_mapping(page_v1)
+
converter = ConfluenceToMarkdown(html_content)
converter.load_attachments(input_dir, output_dir, args.public_dir)
markdown_content = converter.as_markdown()
diff --git a/confluence-mdx/tests/testcases/1844969501/expected.mdx b/confluence-mdx/tests/testcases/1844969501/expected.mdx
index a303eec6..20a3a3ad 100644
--- a/confluence-mdx/tests/testcases/1844969501/expected.mdx
+++ b/confluence-mdx/tests/testcases/1844969501/expected.mdx
@@ -19,11 +19,11 @@ title: '지원'
기술지원을 위한 [Confluence Space](https://querypie.atlassian.net/wiki/spaces/QCP/overview) 에서 상세한 기술지원 자료를 확인할 수 있습니다.
-* [QueryPie Architecture](https://querypie.atlassian.net/wiki/spaces/QCP/overview)
-* [Advanced Environment Setup](https://querypie.atlassian.net/wiki/spaces/QCP/overview)
-* [Advanced Integration Guide](https://querypie.atlassian.net/wiki/spaces/QCP/overview)
-* [릴리스 버전 별 문서](https://querypie.atlassian.net/wiki/spaces/QCP/overview)
-* [Troubleshooting](https://querypie.atlassian.net/wiki/spaces/QCP/overview)
+* [QueryPie Architecture](https://querypie.atlassian.net/wiki/spaces/QCP/pages/400064797)
+* [Advanced Environment Setup](https://querypie.atlassian.net/wiki/spaces/QCP/pages/887947577)
+* [Advanced Integration Guide](https://querypie.atlassian.net/wiki/spaces/QCP/pages/841449834)
+* [릴리스 버전 별 문서](https://querypie.atlassian.net/wiki/spaces/QCP/pages/841351486)
+* [Troubleshooting](https://querypie.atlassian.net/wiki/spaces/QCP/pages/920486841)
#### Product Demo - YouTube