Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

builtin: implement, document and test url-parse #1715

Open
wants to merge 13 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
urlmatch: define url_parse function
Define general parsing function that supports all Git URLs
including scp style URLs such as hostname:~user/repo.
Has the same interface as the URL normalization function
and uses the same data structures, facilitating its use.
It's adapted from the algorithm used to process URLs in connect.c,
so it should support the same inputs.

Signed-off-by: Matheus Afonso Martins Moreira <matheus@matheusmoreira.com>
  • Loading branch information
matheusmoreira committed Apr 28, 2024
commit 13b81b8aa06cfd63a5fd9d1acbaf21a8b388ff47
90 changes: 90 additions & 0 deletions urlmatch.c
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
#include "hex-ll.h"

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

On the Git mailing list, Ghanshyam Thakkar wrote (reply to this):

On Sun, 28 Apr 2024, Matheus Afonso Martins Moreira via GitGitGadget <gitgitgadget@gmail.com> wrote:
> From: Matheus Afonso Martins Moreira <matheus@matheusmoreira.com>
> 
> Define general parsing function that supports all Git URLs
> including scp style URLs such as hostname:~user/repo.
> Has the same interface as the URL normalization function
> and uses the same data structures, facilitating its use.
> It's adapted from the algorithm used to process URLs in connect.c,
> so it should support the same inputs.
> 
> Signed-off-by: Matheus Afonso Martins Moreira <matheus@matheusmoreira.com>
> ---
>  urlmatch.c | 90 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
>  urlmatch.h |  1 +
>  2 files changed, 91 insertions(+)
> 
> diff --git a/urlmatch.c b/urlmatch.c
> index 1d0254abacb..5a442e31fa2 100644
> --- a/urlmatch.c
> +++ b/urlmatch.c
> @@ -3,6 +3,7 @@
>  #include "hex-ll.h"
>  #include "strbuf.h"
>  #include "urlmatch.h"
> +#include "url.h"
>  
>  #define URL_ALPHA "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
>  #define URL_DIGIT "0123456789"
> @@ -438,6 +439,95 @@ char *url_normalize(const char *url, struct url_info *out_info)
>  	return url_normalize_1(url, out_info, 0);
>  }
>  
> +enum protocol {
> +	PROTO_UNKNOWN = 0,
> +	PROTO_LOCAL,
> +	PROTO_FILE,
> +	PROTO_SSH,
> +	PROTO_GIT,
> +};
> +
> +static enum protocol url_get_protocol(const char *name, size_t n)
> +{
> +	if (!strncmp(name, "ssh", n))
> +		return PROTO_SSH;
> +	if (!strncmp(name, "git", n))
> +		return PROTO_GIT;
> +	if (!strncmp(name, "git+ssh", n)) /* deprecated - do not use */
> +		return PROTO_SSH;
> +	if (!strncmp(name, "ssh+git", n)) /* deprecated - do not use */
> +		return PROTO_SSH;
> +	if (!strncmp(name, "file", n))
> +		return PROTO_FILE;
> +	return PROTO_UNKNOWN;
> +}
> +
> +char *url_parse(const char *url_orig, struct url_info *out_info)
> +{
> +	struct strbuf url;
> +	char *host, *separator;
> +	char *detached, *normalized;
> +	enum protocol protocol = PROTO_LOCAL;
> +	struct url_info local_info;
> +	struct url_info *info = out_info? out_info : &local_info;
> +	bool scp_syntax = false;
> +
> +	if (is_url(url_orig)) {
> +		url_orig = url_decode(url_orig);
> +	} else {
> +		url_orig = xstrdup(url_orig);
> +	}
> +
> +	strbuf_init(&url, strlen(url_orig) + sizeof("ssh://"));
> +	strbuf_addstr(&url, url_orig);
> +
> +	host = strstr(url.buf, "://");
> +	if (host) {
> +		protocol = url_get_protocol(url.buf, host - url.buf);
> +		host += 3;
> +	} else {
> +		if (!url_is_local_not_ssh(url.buf)) {
> +			scp_syntax = true;
> +			protocol = PROTO_SSH;
> +			strbuf_insertstr(&url, 0, "ssh://");
> +			host = url.buf + 6;
> +		}
> +	}
Interesting. 

    `
    $ ./git url-parse -c protocol file:/test/test
    ssh
    `

seems like only having a single slash after the 'protocol:' prints
'ssh' always (I think this may not even be a valid url). After this 'else'
block, the url turns into 'ssh://file/test/test'. Will examine the details
later. Not that it's your code's doing, and rather the result of
url_is_local_not_ssh(). But just wanted to point this out and ask if this
should error out or is this an intended behavior that I can't figure out. 

Thanks.

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

On the Git mailing list, Torsten Bögershausen wrote (reply to this):

[]
> Interesting.
>
>     `
>     $ ./git url-parse -c protocol file:/test/test
>     ssh
>     `
>
> seems like only having a single slash after the 'protocol:' prints
> 'ssh' always (I think this may not even be a valid url). After this 'else'
> block, the url turns into 'ssh://file/test/test'. Will examine the details
> later. Not that it's your code's doing, and rather the result of
> url_is_local_not_ssh(). But just wanted to point this out and ask if this
> should error out or is this an intended behavior that I can't figure out.

ssh is the correct answer, try something like

`git clone localhost:/home/myself/project/git.git`

It is the scp syntax, supported by Git as well.
From `man scp`

    scp copies files between hosts on a network.
    []
    The source and target may be specified as a local pathname,
    a remote host with optional path in the form
    [user@]host:[path],
    or a URI in the form scp://[user@]host[:port][/path].
    Local file names can be made explicit using absolute or relative pathnames
    to avoid scp treating file names containing ‘:’ as host specifiers.

So yes, they share similar problems
with the ':' that could mean different things when using the short form.

#include "strbuf.h"
#include "urlmatch.h"
#include "url.h"

#define URL_ALPHA "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
#define URL_DIGIT "0123456789"
Expand Down Expand Up @@ -438,6 +439,95 @@ char *url_normalize(const char *url, struct url_info *out_info)
return url_normalize_1(url, out_info, 0);
}

enum protocol {
PROTO_UNKNOWN = 0,
PROTO_LOCAL,
PROTO_FILE,
PROTO_SSH,
PROTO_GIT,
};

static enum protocol url_get_protocol(const char *name, size_t n)
{
if (!strncmp(name, "ssh", n))
return PROTO_SSH;
if (!strncmp(name, "git", n))
return PROTO_GIT;
if (!strncmp(name, "git+ssh", n)) /* deprecated - do not use */
return PROTO_SSH;
if (!strncmp(name, "ssh+git", n)) /* deprecated - do not use */
return PROTO_SSH;
if (!strncmp(name, "file", n))
return PROTO_FILE;
return PROTO_UNKNOWN;
}

char *url_parse(const char *url_orig, struct url_info *out_info)
{
struct strbuf url;
char *host, *separator;
char *detached, *normalized;
enum protocol protocol = PROTO_LOCAL;
struct url_info local_info;
struct url_info *info = out_info? out_info : &local_info;
bool scp_syntax = false;

if (is_url(url_orig)) {
url_orig = url_decode(url_orig);
} else {
url_orig = xstrdup(url_orig);
}

strbuf_init(&url, strlen(url_orig) + sizeof("ssh://"));
strbuf_addstr(&url, url_orig);

host = strstr(url.buf, "://");
if (host) {
protocol = url_get_protocol(url.buf, host - url.buf);
host += 3;
} else {
if (!url_is_local_not_ssh(url.buf)) {
scp_syntax = true;
protocol = PROTO_SSH;
strbuf_insertstr(&url, 0, "ssh://");
host = url.buf + 6;
}
}

/* path starts after ':' in scp style SSH URLs */
if (scp_syntax) {
separator = strchr(host, ':');
if (separator) {
if (separator[1] == '/')
strbuf_remove(&url, separator - url.buf, 1);
else
*separator = '/';
}
}

detached = strbuf_detach(&url, NULL);
normalized = url_normalize(detached, info);
free(detached);

if (!normalized) {
return NULL;
}

/* point path to ~ for URL's like this:
*
* ssh://host.xz/~user/repo
* git://host.xz/~user/repo
* host.xz:~user/repo
*
*/
if (protocol == PROTO_GIT || protocol == PROTO_SSH) {
if (normalized[info->path_off + 1] == '~')
info->path_off++;
}

return normalized;
}

static size_t url_match_prefix(const char *url,
const char *url_prefix,
size_t url_prefix_len)
Expand Down
1 change: 1 addition & 0 deletions urlmatch.h
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ struct url_info {
};

char *url_normalize(const char *, struct url_info *);
char *url_parse(const char *, struct url_info *);

struct urlmatch_item {
size_t hostmatch_len;
Expand Down